From 65d6d49d15401a7a67b7be8965b5bc16da5b446c Mon Sep 17 00:00:00 2001 From: lavane Date: Mon, 13 May 2024 13:04:36 +0200 Subject: [PATCH 1/6] starting adding ep-pred --- EAPM/Include/Blocks/Ep-Pred.py | 341 +++++++++++++++++++++++++++++++++ EAPM/plugin.meta | 3 +- 2 files changed, 343 insertions(+), 1 deletion(-) create mode 100644 EAPM/Include/Blocks/Ep-Pred.py diff --git a/EAPM/Include/Blocks/Ep-Pred.py b/EAPM/Include/Blocks/Ep-Pred.py new file mode 100644 index 0000000..e1bb97d --- /dev/null +++ b/EAPM/Include/Blocks/Ep-Pred.py @@ -0,0 +1,341 @@ +import datetime +import os +import subprocess + +from HorusAPI import PluginVariable, SlurmBlock, VariableList, VariableTypes + +# TODO Making the block to work in marenostrum, if not, will work in local. +# For the mn execution set default paths + +# ==========================# +# Variable inputs +# ==========================# +inputFasta = PluginVariable( + name="Input fasta", + id="input_fasta", + description="The input fasta file. (-i)", + type=VariableTypes.FILE, + defaultValue=None, + allowedValues=["fasta"], +) + +# ==========================# +# Variable outputs +# ==========================# +outputEppred = PluginVariable( + name="EP-Pred output", + id="path", + description="The folder containing the results.", + type=VariableTypes.FOLDER, +) + +############################## +# Other variables # +############################## +pssmDir = PluginVariable( + name="PSSM directory", + id="pssm_dir", + description="The directory containing the PSSM files. (-p)", + type=VariableTypes.FOLDER, + defaultValue=None, +) +fastadir = PluginVariable( + name="Fasta directory", + id="fasta_dir", + description="The directory containing the fasta files. (-f)", + type=VariableTypes.FOLDER, + defaultValue=None, +) +ifeatureDir = PluginVariable( + name="Ifeature directory", + id="ifeature_dir", + description="The directory containing the ifeature files. (-id)", + type=VariableTypes.FOLDER, + defaultValue=None, +) +possumDir = PluginVariable( + name="Possum directory", + id="possum_dir", + description="The directory containing the possum files. (-Po)", + type=VariableTypes.FOLDER, + defaultValue=None, +) +ifeatureOut = PluginVariable( + name="Ifeature out", + id="ifeature_out", + description="The directory where the ifeature features are. (-io)", + type=VariableTypes.FOLDER, + defaultValue=None, +) +possumOut = PluginVariable( + name="Possum out", + id="possum_out", + description="The directory for the possum extractions. (-po)", + type=VariableTypes.FOLDER, + defaultValue=None, +) +filteredOut = PluginVariable( + name="Filtered output", + id="filtered_out", + description="The directory for the filtered features. (-fo)", + type=VariableTypes.FOLDER, + defaultValue=None, +) +dbinp = PluginVariable( + name="Database input", + id="dbinp", + description="The path to the fasta files to create the database. (-di)", + type=VariableTypes.FOLDER, + defaultValue=None, +) +dbout = PluginVariable( + name="Database output", + id="dbout", + description="The path and name of the created database. (-do)", + type=VariableTypes.FOLDER, + defaultValue=None, +) +numThread = PluginVariable( + name="Number of threads", + id="num_thread", + description="The number of threads to use for the generation of pssm profiles and feature extraction. (-n)", + type=VariableTypes.INTEGER, + defaultValue=5, +) +resDir = PluginVariable( + name="Result directory", + id="res_dir", + description="The name for the folder where to store the prediction results. (-rs)", + type=VariableTypes.FOLDER, + defaultValue="results", +) +numSimilarSamples = PluginVariable( + name="Number of similar samples", + id="num_similar_samples", + description="The number of similar training samples to filter the predictions. (-nss)", + type=VariableTypes.INTEGER, + defaultValue=None, +) +restart = PluginVariable( + name="Restart", + id="restart", + description="From which part of the process to restart with. (-re)", + type=VariableTypes.STRING, + allowedValues=["feature", "predict"], +) +filterOnly = PluginVariable( + name="Filter only", + id="filter_only", + description="True if you already have the features. (-on)", + type=VariableTypes.BOOLEAN, + defaultValue=None, +) +extractionRestart = PluginVariable( + name="Extraction restart", + id="extraction_restart", + description="The file to restart the extraction with. (-er)", + type=VariableTypes.FILE, +) +long = PluginVariable( + name="Long", + id="long", + description="True when restarting from the long commands. (-lg)", + type=VariableTypes.BOOLEAN, + defaultValue=None, +) +run = PluginVariable( + name="Run", + id="run", + description="Run possum or ifeature extraction (-r)", + type=VariableTypes.STRING, + defaultValue=None, + allowedValues=["possum", "ifeature", "both"], +) +start = PluginVariable( + name="Start", + id="start", + description="The starting number. (-st)", + type=VariableTypes.INTEGER, + defaultValue=None, +) +end = PluginVariable( + name="End", + id="end", + description="The ending number, not included. (-en)", + type=VariableTypes.INTEGER, + defaultValue=None, +) +sbatchPath = PluginVariable( + name="Sbatch path", + id="sbatch_path", + description="The folder to keep the run files for generating pssm. (-sp)", + type=VariableTypes.FOLDER, + defaultValue=None, +) +value = PluginVariable( + name="Value", + id="value", + description="The voting threshold to be considered positive. (-v)", + type=VariableTypes.FLOAT, + defaultValue=None, + allowedValues=[1, 0.8, 0.5], +) +iterations = PluginVariable( + name="Iterations", + id="iterations", + description="The number of iterations in the PSIBlast. (-iter)", + type=VariableTypes.INTEGER, + defaultValue=None, +) + + +def runEppred(block: SlurmBlock): + + inputfasta = block.inputs.get("input_fasta", None) + + if inputfasta is None: + raise Exception("No input fasta provided") + if not os.path.exists(inputfasta): + raise Exception(f"The input fasta file does not exist: {inputfasta}") + + command = "python -m ep_pred.Launch " + command += f"-i {inputfasta} " + command += f"-n {block.variables.get('num_thread', 5)} " + command += f"-re {block.variables.get('restart', 'predict')} " + + pssm_dir = block.variables.get("pssm_dir", None) + if pssm_dir is not None: + command += f"-p {pssm_dir} " + fasta_dir = block.variables.get("fasta_dir", None) + if fasta_dir is not None: + command += f"-f {fasta_dir} " + ifeature_dir = block.variables.get("ifeature_dir", None) + if ifeature_dir is not None: + command += f"-id {ifeature_dir} " + possum_dir = block.variables.get("possum_dir", None) + if possum_dir is not None: + command += f"-Po {possum_dir} " + ifeature_out = block.variables.get("ifeature_out", None) + if ifeature_out is not None: + command += f"-io {ifeature_out} " + possum_out = block.variables.get("possum_out", None) + if possum_out is not None: + command += f"-po {possum_out} " + filtered_out = block.variables.get("filtered_out", None) + if filtered_out is not None: + command += f"-fo {filtered_out} " + dbinp = block.variables.get("dbinp", None) + if dbinp is not None: + command += f"-di {dbinp} " + dbout = block.variables.get("dbout", None) + if dbout is not None: + command += f"-do {dbout} " + res_dir = block.variables.get("res_dir", None) + if res_dir is not None: + command += f"-rs {res_dir} " + num_similar_samples = block.variables.get("num_similar_samples", None) + if num_similar_samples is not None: + command += f"-nss {num_similar_samples} " + restart = block.variables.get("restart", None) + if restart is not None: + command += f"-re {restart} " + filter_only = block.variables.get("filter_only", None) + if filter_only is not None: + command += f"-on {filter_only} " + extraction_restart = block.variables.get("extraction_restart", None) + if extraction_restart is not None: + command += f"-er {extraction_restart} " + long = block.variables.get("long", None) + if long is not None: + command += f"-lg {long} " + run = block.variables.get("run", None) + if run is not None: + command += f"-r {run} " + start = block.variables.get("start", None) + if start is not None: + command += f"-st {start} " + end = block.variables.get("end", None) + if end is not None: + command += f"-en {end} " + sbatch_path = block.variables.get("sbatch_path", None) + if sbatch_path is not None: + command += f"-sp {sbatch_path} " + value = block.variables.get("value", None) + if value is not None: + command += f"-v {value} " + iterations = block.variables.get("iterations", None) + if iterations is not None: + command += f"-iter {iterations} " + + jobs = [command] + + folderName = block.variables.get("folder_name", "epPred") + block.extraData["folder_name"] = folderName + removeExisting = block.variables.get("remove_existing_results", False) + + # If folder already exists, raise exception + if removeExisting and os.path.exists(folderName): + os.system("rm -rf " + folderName) + + if not removeExisting and os.path.exists(folderName): + raise Exception( + "The folder {} already exists. Please, choose another name or remove it.".format( + folderName + ) + ) + + # Create an copy the inputs + os.makedirs(folderName, exist_ok=True) + os.system(f"cp {inputfasta} {folderName}") + + from utils import launchCalculationAction + + launchCalculationAction( + block, + jobs, + program="epPred", + uploadFolders=[ + folderName, + ], + ) + + +def finalAction(block: SlurmBlock): + pass + + +from utils import BSC_JOB_VARIABLES + +epPredBlock = SlurmBlock( + name="Ep-pred", + initialAction=runEppred, + finalAction=finalAction, + description="A machine learning program to predict promiscuity of esterases.", + inputs=[inputFasta], + variables=BSC_JOB_VARIABLES + + [ + pssmDir, + fastadir, + ifeatureDir, + possumDir, + ifeatureOut, + possumOut, + filteredOut, + dbinp, + dbout, + numThread, + resDir, + numSimilarSamples, + restart, + filterOnly, + extractionRestart, + long, + run, + start, + end, + sbatchPath, + value, + iterations, + ], + outputs=[outputEppred], +) diff --git a/EAPM/plugin.meta b/EAPM/plugin.meta index 62dacb3..77eca32 100644 --- a/EAPM/plugin.meta +++ b/EAPM/plugin.meta @@ -21,6 +21,7 @@ "git+https://github.com/Martin-Floor/bsc_calculations.git", "git+https://github.com/Martin-Floor/prepare_proteins.git", "git+https://github.com/Martin-Floor/PELE_scripts.git", - "biopython==1.81" + "biopython==1.81", + "git+https://github.com/etiur/EP-pred" ] } From 17d8a3ccb6f5d0ea50fc89e7a4352feb88d7c9e6 Mon Sep 17 00:00:00 2001 From: lavane Date: Wed, 15 May 2024 18:09:54 +0200 Subject: [PATCH 2/6] Starting wiht the Bioml --- EAPM/EAPM.py | 4 + EAPM/Include/Blocks/Ahatool.py | 2 + EAPM/Include/Blocks/AlignPdbEAPM.py | 5 - EAPM/Include/Blocks/AlphaFoldEAPM.py | 1 - EAPM/Include/Blocks/ClassificationBioMl.py | 327 ++++++++++++++++++ EAPM/Include/Blocks/{Ep-Pred.py => EpPred.py} | 7 +- EAPM/plugin.meta | 3 +- 7 files changed, 338 insertions(+), 11 deletions(-) create mode 100644 EAPM/Include/Blocks/ClassificationBioMl.py rename EAPM/Include/Blocks/{Ep-Pred.py => EpPred.py} (98%) diff --git a/EAPM/EAPM.py b/EAPM/EAPM.py index fa79928..58ca75d 100644 --- a/EAPM/EAPM.py +++ b/EAPM/EAPM.py @@ -91,6 +91,10 @@ def createPlugin(): eapmPlugin.addBlock(ahatoolBlock) + from Blocks.EpPred import epPredBlock # type: ignore + + eapmPlugin.addBlock(epPredBlock) + # Add the configs from Configs.mafftConfig import mafftExecutableConfig # type: ignore diff --git a/EAPM/Include/Blocks/Ahatool.py b/EAPM/Include/Blocks/Ahatool.py index 14618eb..a7166ce 100644 --- a/EAPM/Include/Blocks/Ahatool.py +++ b/EAPM/Include/Blocks/Ahatool.py @@ -4,6 +4,8 @@ from HorusAPI import PluginBlock, PluginVariable, VariableList, VariableTypes +# TODO Add to the documentation + # ==========================# # Variable inputs # ==========================# diff --git a/EAPM/Include/Blocks/AlignPdbEAPM.py b/EAPM/Include/Blocks/AlignPdbEAPM.py index 989d4ae..5593445 100644 --- a/EAPM/Include/Blocks/AlignPdbEAPM.py +++ b/EAPM/Include/Blocks/AlignPdbEAPM.py @@ -45,7 +45,6 @@ type=VariableTypes.INTEGER, defaultValue=0, ) - chainIndexesAlign = VariableList( name="Chain indexes", id="chain_indexes", @@ -63,15 +62,12 @@ type=VariableTypes.INTEGER, defaultValue=0, ) - trajectoryChainIndexesAlign = VariableList( name="Trajectory chain indexes", id="trajectory_chain_indexes", description="Chain indexes of the target trajectories to use in the alignment.", prototypes=[trajectoryChainIndexVariable], ) - - alignmentModeAlign = PluginVariable( name="Alignment mode", id="alignment_mode", @@ -80,7 +76,6 @@ defaultValue="aligned", allowedValues=["aligned", "exact"], ) - referenceResiduesAlign = PluginVariable( name="Reference residue index", id="reference_residues", diff --git a/EAPM/Include/Blocks/AlphaFoldEAPM.py b/EAPM/Include/Blocks/AlphaFoldEAPM.py index ce1a7de..8788b06 100644 --- a/EAPM/Include/Blocks/AlphaFoldEAPM.py +++ b/EAPM/Include/Blocks/AlphaFoldEAPM.py @@ -28,7 +28,6 @@ type=VariableTypes.STRING, defaultValue="alphafold", ) - removeExistingResults = PluginVariable( name="Remove existing results", id="remove_existing_results", diff --git a/EAPM/Include/Blocks/ClassificationBioMl.py b/EAPM/Include/Blocks/ClassificationBioMl.py new file mode 100644 index 0000000..c61351b --- /dev/null +++ b/EAPM/Include/Blocks/ClassificationBioMl.py @@ -0,0 +1,327 @@ +""" +Bioml Classification + | Wrapper class for the bioml Classification module. + | Train classification models. +""" + +# TODO Add to the documentation + +import os + +from HorusAPI import PluginVariable, SlurmBlock, VariableGroup, VariableList, VariableTypes + +# ==========================# +# Variable inputs +# ==========================# +inputLabelFile = PluginVariable( + name="Input Label", + id="input_label", + description="The path to the labels of the training set in a csv format or string if it is inside training features.", + type=VariableTypes.FILE, + defaultValue=None, + allowedValues=["csv"], +) +inputLabelString = PluginVariable( + name="Input Label", + id="input_label", + description="The labels of the training set in a string format.", + type=VariableTypes.STRING, + defaultValue=None, +) +trainingFeatures = PluginVariable( + name="Training Features", + id="training_features", + description="The file to where the training features are saved in excel or csv format.", + type=VariableTypes.FILE, + defaultValue=None, + allowedValues=["csv", "xlsx"], +) +fileGroup = VariableGroup( + id="fileType_input", + name="Input File", + description="The input is a file", + variables=[inputLabelFile, trainingFeatures], +) +stringGroup = VariableGroup( + id="stringType_input", + name="Input String", + description="The input is a string", + variables=[inputLabelString, trainingFeatures], +) + +# ==========================# +# Variable outputs +# ==========================# +outputClassification = PluginVariable( + name="Ensemble output", + id="out_zip", + description="The zip file to the output for the ensemble results", + type=VariableTypes.FILE, +) + +############################## +# Other variables # +############################## +trainingOutput = PluginVariable( + name="Training Output", + id="training_output", + description="The path where to save the models training results.", + type=VariableTypes.FOLDER, + defaultValue=None, +) +scalerVar = PluginVariable( + name="Scaler", + id="scaler", + description="Choose one of the scaler available in scikit-learn, defaults to zscore.", + type=VariableTypes.STRING_LIST, + allowedValues=["robust", "zscore", "minmax"] + defaultValue=None, +) +kfoldParameters = PluginVariable( + name="Kfold Parameters", + id="kfold_parameters", + description="The parameters for the kfold in num_split:test_size format ('5:0.2').", + type=VariableTypes.STRING, + defaultValue=None, +) +outliersVar = PluginVariable( + name="Outliers", + id="outliers", + description="A list of outliers if any, the name should be the same as in the excel file with the filtered features, you can also specify the path to a file in plain text format, each record should be in a new line", + type=VariableTypes.STRING, + defaultValue=None, +) +budgetTime = PluginVariable( + name="Budget Time", + id="budget_time", + description="The time budget for the training in minutes, should be > 0 or None.", + type=VariableTypes.FLOAT, + defaultValue=None, +) +precisionWeight = PluginVariable( + name="Precision Weight", + id="precision_weight", + description="Weights to specify how relevant is the precision for the ranking of the different features.", + type=VariableTypes.FLOAT, + defaultValue=None, +) +recallWeight = PluginVariable( + name="Recall Weight", + id="recall_weight", + description="Weights to specify how relevant is the recall for the ranking of the different features.", + type=VariableTypes.FLOAT, + defaultValue=None, +) +reportWeight = PluginVariable( + name="Report Weight", + id="report_weight", + description="Weights to specify how relevant is the f1, precision and recall for the ranking of the different features with respect to MCC which is a more general measures of the performance of a model.", + type=VariableTypes.FLOAT, + defaultValue=None, +) +differenceWeight = PluginVariable( + name="Difference Weight", + id="difference_weight", + description="How important is to have similar training and test metrics.", + type=VariableTypes.FLOAT, + defaultValue=None, +) +bestModels = PluginVariable( + name="Best Models", + id="best_models", + description="The number of best models to select, it affects the analysis and the saved hyperparameters.", + type=VariableTypes.INTEGER, + defaultValue=None, +) +seedVar = PluginVariable( + name="Seed", + id="seed", + description="The seed for the random state.", + type=VariableTypes.INTEGER, + defaultValue=None, +) +dropVar = PluginVariable( + name="Drop", + id="drop", + description="The models to drop.", + type=VariableTypes.STRING_LIST, + defaultValue=None, + allowedValues=['lr','knn','nb','dt','svm','rbfsvm','gpc','mlp','ridge','rf','qda','ada','gbc','lda','et','xgboost','lightgbm','catboost','dummy'] +) +selectedVar = PluginVariable( + name="Selected", + id="selected", + description="The models to select.", + type=VariableTypes.STRING_LIST, + defaultValue=None, + allowedValues=['lr','knn','nb','dt','svm','rbfsvm','gpc','mlp','ridge','rf','qda','ada','gbc','lda','et','xgboost','lightgbm','catboost','dummy'] +) +tuneVar = PluginVariable( + name="Tune", + id="tune", + description="If to tune the best models.", + type=VariableTypes.BOOLEAN, + defaultValue=None, +) +plotVar = PluginVariable( + name="Plot", + id="plot", + description="The plots to save.", + type=VariableTypes.STRING_LIST, + defaultValue=None, + allowedValues=["learning", "confusion_matrix", "class_report", "pr", "auc"] +) +optimizeVar = PluginVariable( + name="Optimize", + id="optimize", + description="The metric to optimize for retuning the best models.", + type=VariableTypes.STRING_LIST, + defaultValue=None, + allowedValues=["MCC", "Prec.", "Recall", "F1", "AUC", "Accuracy", "Average Precision Score"] +) +sheetName = PluginVariable( + name="Sheet Name", + id="sheet_name", + description="The sheet name for the excel file if the training features is in excel format.", + type=VariableTypes.STRING, + defaultValue=None, +) +numIter = PluginVariable( + name="Number of Iterations", + id="num_iter", + description="The number of iterations for the hyperparameter search.", + type=VariableTypes.INTEGER, + defaultValue=None, +) +splitStrategy = PluginVariable( + name="Split Strategy", + id="split_strategy", + description="The strategy to split the data.", + type=VariableTypes.STRING_LIST, + defaultValue=None, + allowedValues=["mutations", "cluster", "stratifiedkfold", "kfold"] +) +clusterVar = PluginVariable( + name="Cluster", + id="cluster", + description="The path to the cluster file generated by mmseqs2 or a custom group index file just like data/resultsDB_clu.tsv.", + type=VariableTypes.FILE, + defaultValue=None, +) +mutationsVar = PluginVariable( + name="Mutations", + id="mutations", + description="The column name of the mutations in the training data.", + type=VariableTypes.STRING, + defaultValue=None, +) +testNumMutations = PluginVariable( + name="Test Number of Mutations", + id="test_num_mutations", + description="The threshold of number of mutations to be included in the test set.", + type=VariableTypes.INTEGER, + defaultValue=None, +) +greaterVar = PluginVariable( + name="Greater", + id="greater", + description="Include in the test set, mutations that are greater of less than the threshold, default greater.", + type=VariableTypes.BOOLEAN, + defaultValue=None, +) +shuffleVar = PluginVariable( + name="Shuffle", + id="shuffle", + description="If to shuffle the data before splitting.", + type=VariableTypes.BOOLEAN, + defaultValue=None, +) +crossValidation = PluginVariable( + name="Cross Validation", + id="cross_validation", + description="If to use cross validation, default is True.", + type=VariableTypes.BOOLEAN, + defaultValue=None, +) + +def runClassificationBioml(block: SlurmBlock): + + input_excel = block.inputs.get("input_excel", None) + if input_excel is None: + raise Exception("No input excel provided") + if not os.path.exists(input_excel): + raise Exception(f"The input excel file does not exist: {input_excel}") + + input_hyperparameters = block.inputs.get("input_hyperparameters", None) + if input_hyperparameters is None: + raise Exception("No input hyperparameters provided") + if not os.path.exists(input_hyperparameters): + raise Exception(f"The input hyperparameters file does not exist: {input_hyperparameters}") + + input_sheets = block.inputs.get("input_sheets", None) + if input_sheets is None: + raise Exception("No input sheets provided") + + input_label = block.inputs.get("input_label", None) + if input_label is None: + raise Exception("No input label provided") + if not os.path.exists(input_label): + raise Exception(f"The input label file does not exist: {input_label}") + + command = "python -m BioML.Ensemble " + command += f"--excel {input_excel} " + command += f"--hyperparameter_path {input_hyperparameters} " + command += f"--sheets {input_sheets} " + command += f"--label {input_label} " + + jobs = [command] + + folderName = block.variables.get("folder_name", "ensembleBioml") + block.extraData["folder_name"] = folderName + removeExisting = block.variables.get("remove_existing_results", False) + + # If folder already exists, raise exception + if removeExisting and os.path.exists(folderName): + os.system("rm -rf " + folderName) + + if not removeExisting and os.path.exists(folderName): + raise Exception( + "The folder {} already exists. Please, choose another name or remove it.".format( + folderName + ) + ) + + # Create an copy the inputs + os.makedirs(folderName, exist_ok=True) + os.system(f"cp {input_excel} {folderName}") + os.system(f"cp {input_hyperparameters} {folderName}") + os.system(f"cp {input_sheets} {folderName}") + os.system(f"cp {input_label} {folderName}") + + from utils import launchCalculationAction + + launchCalculationAction( + block, + jobs, + program="bioml", + uploadFolders=[ + folderName, + ], + ) + + +def finalAction(block: SlurmBlock): + pass + + +from utils import BSC_JOB_VARIABLES + +classificationBioMLBlock = SlurmBlock( + name="Ep-pred", + initialAction=runClassificationBioml, + finalAction=finalAction, + description="Train classification models.", + inputGroups=[fileGroup, stringGroup], + variables=BSC_JOB_VARIABLES + [], + outputs=[outputClassification], +) diff --git a/EAPM/Include/Blocks/Ep-Pred.py b/EAPM/Include/Blocks/EpPred.py similarity index 98% rename from EAPM/Include/Blocks/Ep-Pred.py rename to EAPM/Include/Blocks/EpPred.py index e1bb97d..f30ef04 100644 --- a/EAPM/Include/Blocks/Ep-Pred.py +++ b/EAPM/Include/Blocks/EpPred.py @@ -5,6 +5,7 @@ from HorusAPI import PluginVariable, SlurmBlock, VariableList, VariableTypes # TODO Making the block to work in marenostrum, if not, will work in local. +# TODO Add to documentation # For the mn execution set default paths # ==========================# @@ -120,7 +121,7 @@ name="Restart", id="restart", description="From which part of the process to restart with. (-re)", - type=VariableTypes.STRING, + type=VariableTypes.STRING_LIST, allowedValues=["feature", "predict"], ) filterOnly = PluginVariable( @@ -176,7 +177,7 @@ name="Value", id="value", description="The voting threshold to be considered positive. (-v)", - type=VariableTypes.FLOAT, + type=VariableTypes.NUMBER_LIST, defaultValue=None, allowedValues=[1, 0.8, 0.5], ) @@ -236,7 +237,7 @@ def runEppred(block: SlurmBlock): num_similar_samples = block.variables.get("num_similar_samples", None) if num_similar_samples is not None: command += f"-nss {num_similar_samples} " - restart = block.variables.get("restart", None) + restart = block.variables.get("restart", "feature") if restart is not None: command += f"-re {restart} " filter_only = block.variables.get("filter_only", None) diff --git a/EAPM/plugin.meta b/EAPM/plugin.meta index 77eca32..62dacb3 100644 --- a/EAPM/plugin.meta +++ b/EAPM/plugin.meta @@ -21,7 +21,6 @@ "git+https://github.com/Martin-Floor/bsc_calculations.git", "git+https://github.com/Martin-Floor/prepare_proteins.git", "git+https://github.com/Martin-Floor/PELE_scripts.git", - "biopython==1.81", - "git+https://github.com/etiur/EP-pred" + "biopython==1.81" ] } From 052164579e03c3c4877ee318406d199e8f4b9e86 Mon Sep 17 00:00:00 2001 From: AlbertCS Date: Thu, 16 May 2024 10:42:58 +0200 Subject: [PATCH 3/6] added new blocks --- EAPM/EAPM.py | 16 + EAPM/Include/Blocks/ClassificationBioMl.py | 88 ++++- EAPM/Include/Blocks/PredictBioML.py | 360 +++++++++++++++++++ EAPM/Include/Blocks/RegressionBioMl.py | 391 +++++++++++++++++++++ EAPM/Include/Blocks/outliersBioMl.py | 152 ++++++++ EAPM/Include/Blocks/splitMethodsBioMl.py | 0 6 files changed, 997 insertions(+), 10 deletions(-) create mode 100644 EAPM/Include/Blocks/PredictBioML.py create mode 100644 EAPM/Include/Blocks/RegressionBioMl.py create mode 100644 EAPM/Include/Blocks/outliersBioMl.py create mode 100644 EAPM/Include/Blocks/splitMethodsBioMl.py diff --git a/EAPM/EAPM.py b/EAPM/EAPM.py index 58ca75d..4f0a314 100644 --- a/EAPM/EAPM.py +++ b/EAPM/EAPM.py @@ -95,6 +95,22 @@ def createPlugin(): eapmPlugin.addBlock(epPredBlock) + from Blocks.ClassificationBioMl import classificationBioMLBlock # type: ignore + + eapmPlugin.addBlock(classificationBioMLBlock) + + from Blocks.RegressionBioMl import regressionBioMLBlock # type: ignore + + eapmPlugin.addBlock(regressionBioMLBlock) + + from Blocks.PredictBioML import PredictBioMLBlock # type: ignore + + eapmPlugin.addBlock(PredictBioMLBlock) + + from Blocks.outliersBioMl import outliersBioMLBlock # type: ignore + + eapmPlugin.addBlock(outliersBioMLBlock) + # Add the configs from Configs.mafftConfig import mafftExecutableConfig # type: ignore diff --git a/EAPM/Include/Blocks/ClassificationBioMl.py b/EAPM/Include/Blocks/ClassificationBioMl.py index c61351b..3300f9f 100644 --- a/EAPM/Include/Blocks/ClassificationBioMl.py +++ b/EAPM/Include/Blocks/ClassificationBioMl.py @@ -53,9 +53,9 @@ # Variable outputs # ==========================# outputClassification = PluginVariable( - name="Ensemble output", + name="Classification output", id="out_zip", - description="The zip file to the output for the ensemble results", + description="The zip file to the output for the classification models", type=VariableTypes.FILE, ) @@ -74,7 +74,7 @@ id="scaler", description="Choose one of the scaler available in scikit-learn, defaults to zscore.", type=VariableTypes.STRING_LIST, - allowedValues=["robust", "zscore", "minmax"] + allowedValues=["robust", "zscore", "minmax"], defaultValue=None, ) kfoldParameters = PluginVariable( @@ -146,7 +146,27 @@ description="The models to drop.", type=VariableTypes.STRING_LIST, defaultValue=None, - allowedValues=['lr','knn','nb','dt','svm','rbfsvm','gpc','mlp','ridge','rf','qda','ada','gbc','lda','et','xgboost','lightgbm','catboost','dummy'] + allowedValues=[ + "lr", + "knn", + "nb", + "dt", + "svm", + "rbfsvm", + "gpc", + "mlp", + "ridge", + "rf", + "qda", + "ada", + "gbc", + "lda", + "et", + "xgboost", + "lightgbm", + "catboost", + "dummy", + ], ) selectedVar = PluginVariable( name="Selected", @@ -154,7 +174,27 @@ description="The models to select.", type=VariableTypes.STRING_LIST, defaultValue=None, - allowedValues=['lr','knn','nb','dt','svm','rbfsvm','gpc','mlp','ridge','rf','qda','ada','gbc','lda','et','xgboost','lightgbm','catboost','dummy'] + allowedValues=[ + "lr", + "knn", + "nb", + "dt", + "svm", + "rbfsvm", + "gpc", + "mlp", + "ridge", + "rf", + "qda", + "ada", + "gbc", + "lda", + "et", + "xgboost", + "lightgbm", + "catboost", + "dummy", + ], ) tuneVar = PluginVariable( name="Tune", @@ -169,7 +209,7 @@ description="The plots to save.", type=VariableTypes.STRING_LIST, defaultValue=None, - allowedValues=["learning", "confusion_matrix", "class_report", "pr", "auc"] + allowedValues=["learning", "confusion_matrix", "class_report", "pr", "auc"], ) optimizeVar = PluginVariable( name="Optimize", @@ -177,7 +217,7 @@ description="The metric to optimize for retuning the best models.", type=VariableTypes.STRING_LIST, defaultValue=None, - allowedValues=["MCC", "Prec.", "Recall", "F1", "AUC", "Accuracy", "Average Precision Score"] + allowedValues=["MCC", "Prec.", "Recall", "F1", "AUC", "Accuracy", "Average Precision Score"], ) sheetName = PluginVariable( name="Sheet Name", @@ -199,7 +239,7 @@ description="The strategy to split the data.", type=VariableTypes.STRING_LIST, defaultValue=None, - allowedValues=["mutations", "cluster", "stratifiedkfold", "kfold"] + allowedValues=["mutations", "cluster", "stratifiedkfold", "kfold"], ) clusterVar = PluginVariable( name="Cluster", @@ -244,6 +284,7 @@ defaultValue=None, ) + def runClassificationBioml(block: SlurmBlock): input_excel = block.inputs.get("input_excel", None) @@ -317,11 +358,38 @@ def finalAction(block: SlurmBlock): from utils import BSC_JOB_VARIABLES classificationBioMLBlock = SlurmBlock( - name="Ep-pred", + name="Classification BioML", initialAction=runClassificationBioml, finalAction=finalAction, description="Train classification models.", inputGroups=[fileGroup, stringGroup], - variables=BSC_JOB_VARIABLES + [], + variables=BSC_JOB_VARIABLES + + [ + selectedVar, + dropVar, + trainingOutput, + scalerVar, + kfoldParameters, + outliersVar, + budgetTime, + precisionWeight, + recallWeight, + reportWeight, + differenceWeight, + bestModels, + seedVar, + tuneVar, + plotVar, + optimizeVar, + sheetName, + numIter, + splitStrategy, + clusterVar, + mutationsVar, + testNumMutations, + greaterVar, + shuffleVar, + crossValidation, + ], outputs=[outputClassification], ) diff --git a/EAPM/Include/Blocks/PredictBioML.py b/EAPM/Include/Blocks/PredictBioML.py new file mode 100644 index 0000000..1be5f3c --- /dev/null +++ b/EAPM/Include/Blocks/PredictBioML.py @@ -0,0 +1,360 @@ +""" +A module that performs regression analysis on a dataset. +""" + +import os + +from HorusAPI import PluginVariable, SlurmBlock, VariableGroup, VariableList, VariableTypes + +# ==========================# +# Variable inputs +# ==========================# +fastaFile = PluginVariable( + name="Fasta file", + id="fasta_file", + description="The fasta file path.", + type=VariableTypes.FILE, + defaultValue=None, + allowedValues=["fasta"], +) +modelPath = PluginVariable( + name="Model Path", + id="model_path", + description="The path to the model.", + type=VariableTypes.FILE, + defaultValue=None, +) +testFeatures = PluginVariable( + name="Test Features", + id="test_features", + description="The file to where the test features are saved in excel or csv format.", + type=VariableTypes.FILE, + defaultValue=None, + allowedValues=["csv", "xlsx"], +) +trainingFeatures = PluginVariable( + name="Training Features", + id="training_features", + description="The file to where the training features are saved in excel or csv format.", + type=VariableTypes.FILE, + defaultValue=None, + allowedValues=["csv", "xlsx"], +) + +# ==========================# +# Variable outputs +# ==========================# +outputPrediction = PluginVariable( + name="Prediction output", + id="out_zip", + description="The zip file to the output for the prediction results", + type=VariableTypes.FILE, +) + +############################## +# Other variables # +############################## +trainingOutput = PluginVariable( + name="Training Output", + id="training_output", + description="The path where to save the models training results.", + type=VariableTypes.FOLDER, + defaultValue=None, +) +scalerVar = PluginVariable( + name="Scaler", + id="scaler", + description="Choose one of the scaler available in scikit-learn, defaults to zscore.", + type=VariableTypes.STRING_LIST, + allowedValues=["robust", "zscore", "minmax"], + defaultValue=None, +) +kfoldParameters = PluginVariable( + name="Kfold Parameters", + id="kfold_parameters", + description="The parameters for the kfold in num_split:test_size format ('5:0.2').", + type=VariableTypes.STRING, + defaultValue=None, +) +outliersVar = PluginVariable( + name="Outliers", + id="outliers", + description="A list of outliers if any, the name should be the same as in the excel file with the filtered features, you can also specify the path to a file in plain text format, each record should be in a new line", + type=VariableTypes.STRING, + defaultValue=None, +) +budgetTime = PluginVariable( + name="Budget Time", + id="budget_time", + description="The time budget for the training in minutes, should be > 0 or None.", + type=VariableTypes.FLOAT, + defaultValue=None, +) +precisionWeight = PluginVariable( + name="Precision Weight", + id="precision_weight", + description="Weights to specify how relevant is the precision for the ranking of the different features.", + type=VariableTypes.FLOAT, + defaultValue=None, +) +recallWeight = PluginVariable( + name="Recall Weight", + id="recall_weight", + description="Weights to specify how relevant is the recall for the ranking of the different features.", + type=VariableTypes.FLOAT, + defaultValue=None, +) +reportWeight = PluginVariable( + name="Report Weight", + id="report_weight", + description="Weights to specify how relevant is the f1, precision and recall for the ranking of the different features with respect to MCC which is a more general measures of the performance of a model.", + type=VariableTypes.FLOAT, + defaultValue=None, +) +differenceWeight = PluginVariable( + name="Difference Weight", + id="difference_weight", + description="How important is to have similar training and test metrics.", + type=VariableTypes.FLOAT, + defaultValue=None, +) +bestModels = PluginVariable( + name="Best Models", + id="best_models", + description="The number of best models to select, it affects the analysis and the saved hyperparameters.", + type=VariableTypes.INTEGER, + defaultValue=None, +) +seedVar = PluginVariable( + name="Seed", + id="seed", + description="The seed for the random state.", + type=VariableTypes.INTEGER, + defaultValue=None, +) +dropVar = PluginVariable( + name="Drop", + id="drop", + description="The models to drop.", + type=VariableTypes.STRING_LIST, + defaultValue=None, + allowedValues=[ + "lr", + "knn", + "nb", + "dt", + "svm", + "rbfsvm", + "gpc", + "mlp", + "ridge", + "rf", + "qda", + "ada", + "gbc", + "lda", + "et", + "xgboost", + "lightgbm", + "catboost", + "dummy", + ], +) +selectedVar = PluginVariable( + name="Selected", + id="selected", + description="The models to select.", + type=VariableTypes.STRING_LIST, + defaultValue=None, + allowedValues=[ + "lr", + "knn", + "nb", + "dt", + "svm", + "rbfsvm", + "gpc", + "mlp", + "ridge", + "rf", + "qda", + "ada", + "gbc", + "lda", + "et", + "xgboost", + "lightgbm", + "catboost", + "dummy", + ], +) +tuneVar = PluginVariable( + name="Tune", + id="tune", + description="If to tune the best models.", + type=VariableTypes.BOOLEAN, + defaultValue=None, +) +plotVar = PluginVariable( + name="Plot", + id="plot", + description="The plots to save.", + type=VariableTypes.STRING_LIST, + defaultValue=None, + allowedValues=["learning", "confusion_matrix", "class_report", "pr", "auc"], +) +optimizeVar = PluginVariable( + name="Optimize", + id="optimize", + description="The metric to optimize for retuning the best models.", + type=VariableTypes.STRING_LIST, + defaultValue=None, + allowedValues=["MCC", "Prec.", "Recall", "F1", "AUC", "Accuracy", "Average Precision Score"], +) +sheetName = PluginVariable( + name="Sheet Name", + id="sheet_name", + description="The sheet name for the excel file if the training features is in excel format.", + type=VariableTypes.STRING, + defaultValue=None, +) +numIter = PluginVariable( + name="Number of Iterations", + id="num_iter", + description="The number of iterations for the hyperparameter search.", + type=VariableTypes.INTEGER, + defaultValue=None, +) +splitStrategy = PluginVariable( + name="Split Strategy", + id="split_strategy", + description="The strategy to split the data.", + type=VariableTypes.STRING_LIST, + defaultValue=None, + allowedValues=["mutations", "cluster", "stratifiedkfold", "kfold"], +) +clusterVar = PluginVariable( + name="Cluster", + id="cluster", + description="The path to the cluster file generated by mmseqs2 or a custom group index file just like data/resultsDB_clu.tsv.", + type=VariableTypes.FILE, + defaultValue=None, +) +mutationsVar = PluginVariable( + name="Mutations", + id="mutations", + description="The column name of the mutations in the training data.", + type=VariableTypes.STRING, + defaultValue=None, +) +testNumMutations = PluginVariable( + name="Test Number of Mutations", + id="test_num_mutations", + description="The threshold of number of mutations to be included in the test set.", + type=VariableTypes.INTEGER, + defaultValue=None, +) +greaterVar = PluginVariable( + name="Greater", + id="greater", + description="Include in the test set, mutations that are greater of less than the threshold, default greater.", + type=VariableTypes.BOOLEAN, + defaultValue=None, +) +shuffleVar = PluginVariable( + name="Shuffle", + id="shuffle", + description="If to shuffle the data before splitting.", + type=VariableTypes.BOOLEAN, + defaultValue=None, +) +crossValidation = PluginVariable( + name="Cross Validation", + id="cross_validation", + description="If to use cross validation, default is True.", + type=VariableTypes.BOOLEAN, + defaultValue=None, +) + + +def runClassificationBioml(block: SlurmBlock): + + input_excel = block.inputs.get("input_excel", None) + if input_excel is None: + raise Exception("No input excel provided") + if not os.path.exists(input_excel): + raise Exception(f"The input excel file does not exist: {input_excel}") + + input_hyperparameters = block.inputs.get("input_hyperparameters", None) + if input_hyperparameters is None: + raise Exception("No input hyperparameters provided") + if not os.path.exists(input_hyperparameters): + raise Exception(f"The input hyperparameters file does not exist: {input_hyperparameters}") + + input_sheets = block.inputs.get("input_sheets", None) + if input_sheets is None: + raise Exception("No input sheets provided") + + input_label = block.inputs.get("input_label", None) + if input_label is None: + raise Exception("No input label provided") + if not os.path.exists(input_label): + raise Exception(f"The input label file does not exist: {input_label}") + + command = "python -m BioML.Ensemble " + command += f"--excel {input_excel} " + command += f"--hyperparameter_path {input_hyperparameters} " + command += f"--sheets {input_sheets} " + command += f"--label {input_label} " + + jobs = [command] + + folderName = block.variables.get("folder_name", "ensembleBioml") + block.extraData["folder_name"] = folderName + removeExisting = block.variables.get("remove_existing_results", False) + + # If folder already exists, raise exception + if removeExisting and os.path.exists(folderName): + os.system("rm -rf " + folderName) + + if not removeExisting and os.path.exists(folderName): + raise Exception( + "The folder {} already exists. Please, choose another name or remove it.".format( + folderName + ) + ) + + # Create an copy the inputs + os.makedirs(folderName, exist_ok=True) + os.system(f"cp {input_excel} {folderName}") + os.system(f"cp {input_hyperparameters} {folderName}") + os.system(f"cp {input_sheets} {folderName}") + os.system(f"cp {input_label} {folderName}") + + from utils import launchCalculationAction + + launchCalculationAction( + block, + jobs, + program="bioml", + uploadFolders=[ + folderName, + ], + ) + + +def finalAction(block: SlurmBlock): + pass + + +from utils import BSC_JOB_VARIABLES + +PredictBioMLBlock = SlurmBlock( + name="Predict BioMl", + initialAction=runClassificationBioml, + finalAction=finalAction, + description="Predict using the models and average the votations.", + inputs=[fastaFile, modelPath, testFeatures, trainingFeatures], + variables=BSC_JOB_VARIABLES + [], + outputs=[outputPrediction], +) diff --git a/EAPM/Include/Blocks/RegressionBioMl.py b/EAPM/Include/Blocks/RegressionBioMl.py new file mode 100644 index 0000000..0da2825 --- /dev/null +++ b/EAPM/Include/Blocks/RegressionBioMl.py @@ -0,0 +1,391 @@ +""" +A module that performs regression analysis on a dataset. +""" + +import os + +from HorusAPI import PluginVariable, SlurmBlock, VariableGroup, VariableList, VariableTypes + +# ==========================# +# Variable inputs +# ==========================# +inputLabelFile = PluginVariable( + name="Input Label", + id="input_label", + description="The path to the labels of the training set in a csv format or string if it is inside training features.", + type=VariableTypes.FILE, + defaultValue=None, + allowedValues=["csv"], +) +inputLabelString = PluginVariable( + name="Input Label", + id="input_label", + description="The labels of the training set in a string format.", + type=VariableTypes.STRING, + defaultValue=None, +) +trainingFeatures = PluginVariable( + name="Training Features", + id="training_features", + description="The file to where the training features are saved in excel or csv format.", + type=VariableTypes.FILE, + defaultValue=None, + allowedValues=["csv", "xlsx"], +) +fileGroup = VariableGroup( + id="fileType_input", + name="Input File", + description="The input is a file", + variables=[inputLabelFile, trainingFeatures], +) +stringGroup = VariableGroup( + id="stringType_input", + name="Input String", + description="The input is a string", + variables=[inputLabelString, trainingFeatures], +) + +# ==========================# +# Variable outputs +# ==========================# +outputClassification = PluginVariable( + name="Regression output", + id="out_zip", + description="The zip file to the output for the regression models", + type=VariableTypes.FILE, +) + +############################## +# Other variables # +############################## +trainingOutput = PluginVariable( + name="Training Output", + id="training_output", + description="The path where to save the models training results.", + type=VariableTypes.FOLDER, + defaultValue=None, +) +scalerVar = PluginVariable( + name="Scaler", + id="scaler", + description="Choose one of the scaler available in scikit-learn, defaults to zscore.", + type=VariableTypes.STRING_LIST, + allowedValues=["robust", "zscore", "minmax"], + defaultValue=None, +) +kfoldParameters = PluginVariable( + name="Kfold Parameters", + id="kfold_parameters", + description="The parameters for the kfold in num_split:test_size format ('5:0.2').", + type=VariableTypes.STRING, + defaultValue=None, +) +outliersVar = PluginVariable( + name="Outliers", + id="outliers", + description="A list of outliers if any, the name should be the same as in the excel file with the filtered features, you can also specify the path to a file in plain text format, each record should be in a new line", + type=VariableTypes.STRING, + defaultValue=None, +) +budgetTime = PluginVariable( + name="Budget Time", + id="budget_time", + description="The time budget for the training in minutes, should be > 0 or None.", + type=VariableTypes.FLOAT, + defaultValue=None, +) +precisionWeight = PluginVariable( + name="Precision Weight", + id="precision_weight", + description="Weights to specify how relevant is the precision for the ranking of the different features.", + type=VariableTypes.FLOAT, + defaultValue=None, +) +recallWeight = PluginVariable( + name="Recall Weight", + id="recall_weight", + description="Weights to specify how relevant is the recall for the ranking of the different features.", + type=VariableTypes.FLOAT, + defaultValue=None, +) +reportWeight = PluginVariable( + name="Report Weight", + id="report_weight", + description="Weights to specify how relevant is the f1, precision and recall for the ranking of the different features with respect to MCC which is a more general measures of the performance of a model.", + type=VariableTypes.FLOAT, + defaultValue=None, +) +differenceWeight = PluginVariable( + name="Difference Weight", + id="difference_weight", + description="How important is to have similar training and test metrics.", + type=VariableTypes.FLOAT, + defaultValue=None, +) +bestModels = PluginVariable( + name="Best Models", + id="best_models", + description="The number of best models to select, it affects the analysis and the saved hyperparameters.", + type=VariableTypes.INTEGER, + defaultValue=None, +) +seedVar = PluginVariable( + name="Seed", + id="seed", + description="The seed for the random state.", + type=VariableTypes.INTEGER, + defaultValue=None, +) +dropVar = PluginVariable( + name="Drop", + id="drop", + description="The models to drop.", + type=VariableTypes.STRING_LIST, + defaultValue=None, + allowedValues=[ + "lr", + "knn", + "nb", + "dt", + "svm", + "rbfsvm", + "gpc", + "mlp", + "ridge", + "rf", + "qda", + "ada", + "gbc", + "lda", + "et", + "xgboost", + "lightgbm", + "catboost", + "dummy", + ], +) +selectedVar = PluginVariable( + name="Selected", + id="selected", + description="The models to select.", + type=VariableTypes.STRING_LIST, + defaultValue=None, + allowedValues=[ + "lr", + "knn", + "nb", + "dt", + "svm", + "rbfsvm", + "gpc", + "mlp", + "ridge", + "rf", + "qda", + "ada", + "gbc", + "lda", + "et", + "xgboost", + "lightgbm", + "catboost", + "dummy", + ], +) +tuneVar = PluginVariable( + name="Tune", + id="tune", + description="If to tune the best models.", + type=VariableTypes.BOOLEAN, + defaultValue=None, +) +plotVar = PluginVariable( + name="Plot", + id="plot", + description="The plots to save.", + type=VariableTypes.STRING_LIST, + defaultValue=None, + allowedValues=["learning", "confusion_matrix", "class_report", "pr", "auc"], +) +optimizeVar = PluginVariable( + name="Optimize", + id="optimize", + description="The metric to optimize for retuning the best models.", + type=VariableTypes.STRING_LIST, + defaultValue=None, + allowedValues=["MCC", "Prec.", "Recall", "F1", "AUC", "Accuracy", "Average Precision Score"], +) +sheetName = PluginVariable( + name="Sheet Name", + id="sheet_name", + description="The sheet name for the excel file if the training features is in excel format.", + type=VariableTypes.STRING, + defaultValue=None, +) +numIter = PluginVariable( + name="Number of Iterations", + id="num_iter", + description="The number of iterations for the hyperparameter search.", + type=VariableTypes.INTEGER, + defaultValue=None, +) +splitStrategy = PluginVariable( + name="Split Strategy", + id="split_strategy", + description="The strategy to split the data.", + type=VariableTypes.STRING_LIST, + defaultValue=None, + allowedValues=["mutations", "cluster", "stratifiedkfold", "kfold"], +) +clusterVar = PluginVariable( + name="Cluster", + id="cluster", + description="The path to the cluster file generated by mmseqs2 or a custom group index file just like data/resultsDB_clu.tsv.", + type=VariableTypes.FILE, + defaultValue=None, +) +mutationsVar = PluginVariable( + name="Mutations", + id="mutations", + description="The column name of the mutations in the training data.", + type=VariableTypes.STRING, + defaultValue=None, +) +testNumMutations = PluginVariable( + name="Test Number of Mutations", + id="test_num_mutations", + description="The threshold of number of mutations to be included in the test set.", + type=VariableTypes.INTEGER, + defaultValue=None, +) +greaterVar = PluginVariable( + name="Greater", + id="greater", + description="Include in the test set, mutations that are greater of less than the threshold, default greater.", + type=VariableTypes.BOOLEAN, + defaultValue=None, +) +shuffleVar = PluginVariable( + name="Shuffle", + id="shuffle", + description="If to shuffle the data before splitting.", + type=VariableTypes.BOOLEAN, + defaultValue=None, +) +crossValidation = PluginVariable( + name="Cross Validation", + id="cross_validation", + description="If to use cross validation, default is True.", + type=VariableTypes.BOOLEAN, + defaultValue=None, +) + + +def runClassificationBioml(block: SlurmBlock): + + input_excel = block.inputs.get("input_excel", None) + if input_excel is None: + raise Exception("No input excel provided") + if not os.path.exists(input_excel): + raise Exception(f"The input excel file does not exist: {input_excel}") + + input_hyperparameters = block.inputs.get("input_hyperparameters", None) + if input_hyperparameters is None: + raise Exception("No input hyperparameters provided") + if not os.path.exists(input_hyperparameters): + raise Exception(f"The input hyperparameters file does not exist: {input_hyperparameters}") + + input_sheets = block.inputs.get("input_sheets", None) + if input_sheets is None: + raise Exception("No input sheets provided") + + input_label = block.inputs.get("input_label", None) + if input_label is None: + raise Exception("No input label provided") + if not os.path.exists(input_label): + raise Exception(f"The input label file does not exist: {input_label}") + + command = "python -m BioML.Ensemble " + command += f"--excel {input_excel} " + command += f"--hyperparameter_path {input_hyperparameters} " + command += f"--sheets {input_sheets} " + command += f"--label {input_label} " + + jobs = [command] + + folderName = block.variables.get("folder_name", "ensembleBioml") + block.extraData["folder_name"] = folderName + removeExisting = block.variables.get("remove_existing_results", False) + + # If folder already exists, raise exception + if removeExisting and os.path.exists(folderName): + os.system("rm -rf " + folderName) + + if not removeExisting and os.path.exists(folderName): + raise Exception( + "The folder {} already exists. Please, choose another name or remove it.".format( + folderName + ) + ) + + # Create an copy the inputs + os.makedirs(folderName, exist_ok=True) + os.system(f"cp {input_excel} {folderName}") + os.system(f"cp {input_hyperparameters} {folderName}") + os.system(f"cp {input_sheets} {folderName}") + os.system(f"cp {input_label} {folderName}") + + from utils import launchCalculationAction + + launchCalculationAction( + block, + jobs, + program="bioml", + uploadFolders=[ + folderName, + ], + ) + + +def finalAction(block: SlurmBlock): + pass + + +from utils import BSC_JOB_VARIABLES + +regressionBioMLBlock = SlurmBlock( + name="Regression BioMl", + initialAction=runClassificationBioml, + finalAction=finalAction, + description="Train Regression models.", + inputGroups=[fileGroup, stringGroup], + variables=BSC_JOB_VARIABLES + + [ + selectedVar, + dropVar, + trainingOutput, + scalerVar, + kfoldParameters, + outliersVar, + budgetTime, + precisionWeight, + recallWeight, + reportWeight, + differenceWeight, + bestModels, + seedVar, + tuneVar, + plotVar, + optimizeVar, + sheetName, + numIter, + splitStrategy, + clusterVar, + mutationsVar, + testNumMutations, + greaterVar, + shuffleVar, + crossValidation, + ], + outputs=[outputClassification], +) diff --git a/EAPM/Include/Blocks/outliersBioMl.py b/EAPM/Include/Blocks/outliersBioMl.py new file mode 100644 index 0000000..d402e4a --- /dev/null +++ b/EAPM/Include/Blocks/outliersBioMl.py @@ -0,0 +1,152 @@ +""" +A module that performs regression analysis on a dataset. +""" + +import os + +from HorusAPI import PluginVariable, SlurmBlock, VariableGroup, VariableList, VariableTypes + +# ==========================# +# Variable inputs +# ==========================# +excelFile = PluginVariable( + name="Excel file", + id="excel_file", + description="The file to where the selected features are saved in excel format.", + type=VariableTypes.FILE, + defaultValue=None, + allowedValues=["xlsx"], +) + + +# ==========================# +# Variable outputs +# ==========================# +outputOutliers = PluginVariable( + name="Outliers output", + id="out_zip", + description="The path to the output for the outliers.", + type=VariableTypes.FILE, +) + +############################## +# Other variables # +############################## +numThreads = PluginVariable( + name="Number of threads", + id="num_threads", + description="The number of threads to use.", + type=VariableTypes.INTEGER, + defaultValue=None, +) +scalerVar = PluginVariable( + name="Scaler", + id="scaler", + description="The scaler to use.", + type=VariableTypes.STRING, + defaultValue="StandardScaler", + allowedValues=["StandardScaler", "MinMaxScaler", "RobustScaler"], +) +contaminationVar = PluginVariable( + name="Contamination", + id="contamination", + description="The contamination value.", + type=VariableTypes.FLOAT, + defaultValue=None, +) +numFeatures = PluginVariable( + name="Number of features", + id="num_features", + description="The number of features to use.", + type=VariableTypes.INTEGER, + defaultValue=None, +) + + +def runClassificationBioml(block: SlurmBlock): + + input_excel = block.inputs.get("input_excel", None) + if input_excel is None: + raise Exception("No input excel provided") + if not os.path.exists(input_excel): + raise Exception(f"The input excel file does not exist: {input_excel}") + + input_hyperparameters = block.inputs.get("input_hyperparameters", None) + if input_hyperparameters is None: + raise Exception("No input hyperparameters provided") + if not os.path.exists(input_hyperparameters): + raise Exception(f"The input hyperparameters file does not exist: {input_hyperparameters}") + + input_sheets = block.inputs.get("input_sheets", None) + if input_sheets is None: + raise Exception("No input sheets provided") + + input_label = block.inputs.get("input_label", None) + if input_label is None: + raise Exception("No input label provided") + if not os.path.exists(input_label): + raise Exception(f"The input label file does not exist: {input_label}") + + command = "python -m BioML.Ensemble " + command += f"--excel {input_excel} " + command += f"--hyperparameter_path {input_hyperparameters} " + command += f"--sheets {input_sheets} " + command += f"--label {input_label} " + + jobs = [command] + + folderName = block.variables.get("folder_name", "ensembleBioml") + block.extraData["folder_name"] = folderName + removeExisting = block.variables.get("remove_existing_results", False) + + # If folder already exists, raise exception + if removeExisting and os.path.exists(folderName): + os.system("rm -rf " + folderName) + + if not removeExisting and os.path.exists(folderName): + raise Exception( + "The folder {} already exists. Please, choose another name or remove it.".format( + folderName + ) + ) + + # Create an copy the inputs + os.makedirs(folderName, exist_ok=True) + os.system(f"cp {input_excel} {folderName}") + os.system(f"cp {input_hyperparameters} {folderName}") + os.system(f"cp {input_sheets} {folderName}") + os.system(f"cp {input_label} {folderName}") + + from utils import launchCalculationAction + + launchCalculationAction( + block, + jobs, + program="bioml", + uploadFolders=[ + folderName, + ], + ) + + +def finalAction(block: SlurmBlock): + pass + + +from utils import BSC_JOB_VARIABLES + +outliersBioMLBlock = SlurmBlock( + name="Regression BioMl", + initialAction=runClassificationBioml, + finalAction=finalAction, + description="Train Regression models.", + inputs=[excelFile], + variables=BSC_JOB_VARIABLES + + [ + numThreads, + scalerVar, + contaminationVar, + numFeatures, + ], + outputs=[outputOutliers], +) diff --git a/EAPM/Include/Blocks/splitMethodsBioMl.py b/EAPM/Include/Blocks/splitMethodsBioMl.py new file mode 100644 index 0000000..e69de29 From ae29e2174c237bea0471dff72f8cc6ac160ad147 Mon Sep 17 00:00:00 2001 From: AlbertCS Date: Thu, 23 May 2024 11:03:50 +0200 Subject: [PATCH 4/6] Update to use the new MN5, and working on glide docking --- EAPM/EAPM.py | 16 +- EAPM/Include/Blocks/Ahatool.py | 9 +- EAPM/Include/Blocks/AlphaFoldEAPM.py | 12 +- EAPM/Include/Blocks/AnalyseGlideGPX.py | 167 +++++++++ EAPM/Include/Blocks/ClassificationBioMl.py | 395 -------------------- EAPM/Include/Blocks/ConservedResiduesMSA.py | 6 +- EAPM/Include/Blocks/EpPred.py | 8 + EAPM/Include/Blocks/PDBToMAE.py | 11 +- EAPM/Include/Blocks/PredictBioML.py | 360 ------------------ EAPM/Include/Blocks/PrepWizardEAPM.py | 2 +- EAPM/Include/Blocks/RegressionBioMl.py | 391 ------------------- EAPM/Include/Blocks/SetupDockingGrid.py | 2 +- EAPM/Include/Blocks/SetupGlide.py | 2 +- EAPM/Include/Blocks/TrimAlphafoldModels.py | 4 +- EAPM/Include/Blocks/analyseGlideFelip.py | 104 ++++++ EAPM/Include/Blocks/outliersBioMl.py | 152 -------- EAPM/Include/Blocks/splitMethodsBioMl.py | 0 EAPM/Include/Blocks/testBlock.py | 75 ++++ EAPM/Include/Configs/hmmerConfig.py | 2 +- EAPM/Include/Configs/mafftConfig.py | 2 +- EAPM/Include/utils.py | 73 +++- EAPM/config/eapm.json | 2 +- EAPM/plugin.meta | 18 +- EAPM/preinst.sh | 3 + 24 files changed, 469 insertions(+), 1347 deletions(-) create mode 100644 EAPM/Include/Blocks/AnalyseGlideGPX.py delete mode 100644 EAPM/Include/Blocks/ClassificationBioMl.py delete mode 100644 EAPM/Include/Blocks/PredictBioML.py delete mode 100644 EAPM/Include/Blocks/RegressionBioMl.py create mode 100644 EAPM/Include/Blocks/analyseGlideFelip.py delete mode 100644 EAPM/Include/Blocks/outliersBioMl.py delete mode 100644 EAPM/Include/Blocks/splitMethodsBioMl.py create mode 100644 EAPM/Include/Blocks/testBlock.py create mode 100644 EAPM/preinst.sh diff --git a/EAPM/EAPM.py b/EAPM/EAPM.py index 4f0a314..a7ad8f3 100644 --- a/EAPM/EAPM.py +++ b/EAPM/EAPM.py @@ -95,21 +95,13 @@ def createPlugin(): eapmPlugin.addBlock(epPredBlock) - from Blocks.ClassificationBioMl import classificationBioMLBlock # type: ignore + from Blocks.testBlock import testBlock # type: ignore - eapmPlugin.addBlock(classificationBioMLBlock) + eapmPlugin.addBlock(testBlock) - from Blocks.RegressionBioMl import regressionBioMLBlock # type: ignore + from Blocks.AnalyseGlideGPX import AnalyseGPXBlock # type: ignore - eapmPlugin.addBlock(regressionBioMLBlock) - - from Blocks.PredictBioML import PredictBioMLBlock # type: ignore - - eapmPlugin.addBlock(PredictBioMLBlock) - - from Blocks.outliersBioMl import outliersBioMLBlock # type: ignore - - eapmPlugin.addBlock(outliersBioMLBlock) + eapmPlugin.addBlock(AnalyseGPXBlock) # Add the configs from Configs.mafftConfig import mafftExecutableConfig # type: ignore diff --git a/EAPM/Include/Blocks/Ahatool.py b/EAPM/Include/Blocks/Ahatool.py index a7166ce..1e6e64b 100644 --- a/EAPM/Include/Blocks/Ahatool.py +++ b/EAPM/Include/Blocks/Ahatool.py @@ -46,6 +46,13 @@ ############################## # Other variables # ############################## +removeExistingResults = PluginVariable( + name="Remove existing results", + id="remove_existing_results", + description="Remove existing results", + type=VariableTypes.BOOLEAN, + defaultValue=False, +) prefixVar = PluginVariable( name="Prefix", id="prefix", @@ -147,6 +154,6 @@ def initialAction(block: PluginBlock): action=initialAction, description="Iteratively search a protein sequence against a protein database", inputs=[inputFasta, dbPath, containerName], - variables=[prefixVar, startVar, evalVar, threadsVar], + variables=[removeExistingResults, prefixVar, startVar, evalVar, threadsVar], outputs=[outputAhatool], ) diff --git a/EAPM/Include/Blocks/AlphaFoldEAPM.py b/EAPM/Include/Blocks/AlphaFoldEAPM.py index 8788b06..815c9d8 100644 --- a/EAPM/Include/Blocks/AlphaFoldEAPM.py +++ b/EAPM/Include/Blocks/AlphaFoldEAPM.py @@ -62,6 +62,16 @@ def initialAlphafold(block: SlurmBlock): folderName = block.variables.get("folder_name", "alphafold") removeExisting = block.variables.get("remove_existing_results", False) + cpus_per_task = block.variables.get("cpus_per_task") + if cpus_per_task is 1: + print("Alphafold requires at least 20 cpus per task. Changing to 20 cpus per task.") + block.variables["cpus_per_task"] = 20 + + partiton = block.variables.get("partition") + if partiton is None: + print("Alphafold requires an accelerated partition. Changing to acc_bscls.") + block.variables["partition"] = "acc_bscls" + # If folder already exists, raise exception if removeExisting and os.path.exists(folderName): os.system("rm -rf " + folderName) @@ -106,7 +116,7 @@ def finalAlhafoldAction(block: SlurmBlock): alphafoldBlock = SlurmBlock( name="Alphafold", - description="Run Alphafold. (For cte_power, marenostrum, nord3 and minotauro clusters or local)", + description="Run Alphafold. (For marenostrum, nord3 clusters or local)", initialAction=initialAlphafold, finalAction=finalAlhafoldAction, variables=BSC_JOB_VARIABLES + [outputAF, removeExistingResults], diff --git a/EAPM/Include/Blocks/AnalyseGlideGPX.py b/EAPM/Include/Blocks/AnalyseGlideGPX.py new file mode 100644 index 0000000..425a48a --- /dev/null +++ b/EAPM/Include/Blocks/AnalyseGlideGPX.py @@ -0,0 +1,167 @@ +import json +import os +import shutil + +import bsc_calculations +import pandas as pd +import prepare_proteins + +from HorusAPI import PluginBlock, PluginVariable, VariableGroup, VariableTypes + +# TODO Configure the inputs correctly + +# ==========================# +# Variable inputs +# ==========================# +glideOutputVariable = PluginVariable( + id="glide_output", + name="Glide output", + description="Glide output from the BSC calculations block", + type=VariableTypes.CUSTOM, + allowedValues=["bsc_results"], +) +conservedResidues = PluginVariable( + name="Conserved residues", + id="conserved_indexes", + description="The conserved residues", + type=VariableTypes.CUSTOM, + defaultValue=None, +) +residueProtein = PluginVariable( + name="Atom Protein", id="resi_id1", description="atom1", type=VariableTypes.ATOM +) +residueLigand = PluginVariable( + name="Atom Ligand", id="resi_id2", description="atom2", type=VariableTypes.ATOM +) +resNameProt = PluginVariable( + name="Protein residue name", + id="res_name_prot", + description="The protein residue name", + type=VariableTypes.STRING, + defaultValue="CYS", +) +resNameLig = PluginVariable( + name="Ligand residue name", + id="res_name_lig", + description="The ligand residue name", + type=VariableTypes.STRING, + defaultValue="SG", +) +ligandName = PluginVariable( + name="Ligand name", + id="ligand_name", + description="The ligand name", + type=VariableTypes.STRING, + defaultValue="GSH", +) + +stringGroup = VariableGroup( + id="string_input", + name="Input String", + description="The input are in string", + variables=[conservedResidues, glideOutputVariable, resNameProt, resNameLig, ligandName], +) +atomGroup = VariableGroup( + id="atom_input", + name="Input Atom", + description="The input are in atom", + variables=[conservedResidues, glideOutputVariable, residueProtein, residueLigand], +) + +# Output variables +outputModelsVariable = PluginVariable( + id="models", + name="Alphafold models", + description="The output models", + type=VariableTypes.FOLDER, +) + + +# ==========================# +# Variable +# ==========================# +metricsVar = PluginVariable( + name="Metrics ", + id="metrics", + description="The metrics list", + type=VariableTypes.STRING, + defaultValue="SG_S", +) + + +def finalAction(block: PluginBlock): + + bsc_result = block.inputs.get(glideOutputVariable.id, None) + folder_to_analyse = bsc_result["dock_folder"] + model_folder = bsc_result["model_folder"] + + conserved_indexes = block.inputs.get(conservedResidues.id, None) + + metrics = block.variables.get("metrics", "SG_S") + + if block.selectedInputGroup == stringGroup.id: + res_name_prot = block.inputs.get(resNameProt.id, "CYS") + res_name_lig = block.inputs.get(resNameLig.id, "SG") + ligand_name = block.inputs.get(ligandName.id, "GSH") + else: + residue_protein = block.inputs.get(residueProtein.id, None) + residue_ligand = block.inputs.get(residueLigand.id, None) + + models = prepare_proteins.proteinModels(model_folder) + + center_atom = {} # Create dictionary to store the atom 3-element tuple for each model + for model in models: # Iterate the models inside the library + # Iterate the residues for each Bio.PDB.Structure object + for r in models.structures[model].get_residues(): + # Check that the residue matches the defined index + aa = conserved_indexes[model] + # for cons_ind in conserved_indexes[model]: + if r.id[1] in conserved_indexes[model]: + # Assert that the residue has the correct residue identity + if r.resname == res_name_prot: + # Store the corresponsing tuple. + center_atom[model] = (r.get_parent().id, r.id[1], res_name_lig) + break + + print(f"center_atom: {center_atom}") + + atom_pairs = {} # Define the dictionary containing the atom pairs for each model + for model in models: + atom_pairs[model] = {} + for ligand in [ligand_name]: + atom_pairs[model][ligand] = [] + atom_pairs[model][ligand].append((center_atom[model], "S1")) + + print(f"Atom pairs: {atom_pairs}") + + models.analyseDocking(folder_to_analyse, atom_pairs=atom_pairs) + + metric_distances = {} # Define the global dictionary + metric_distances[metrics] = {} # Define the metric nested dictionary + for model in models: + metric_distances[metrics][model] = {} # Define the model nested dictionary + for ligand in models.docking_ligands[model]: + # Define the ligand nested dictionary with all the docking distances list + metric_distances[metrics][model][ligand] = models.getDockingDistances(model, ligand) + + print(f"metric_distances: {metric_distances}") + + models.combineDockingDistancesIntoMetrics(metric_distances) + + print(f"models.docking_data: {models.docking_data}") + + best_poses = models.getBestDockingPosesIteratively(metric_distances) + + models.extractDockingPoses(best_poses, folder_to_analyse, "best_docking_poses", separator="@") + + block.setOutput(outputModelsVariable.id, "best_docking_poses") + + +AnalyseGPXBlock = PluginBlock( + name="Analyse Glide GPX", + description="To analyse Glide GPX results", + action=finalAction, + variables=[metricsVar], + inputGroups=[stringGroup, atomGroup], + outputs=[outputModelsVariable], +) diff --git a/EAPM/Include/Blocks/ClassificationBioMl.py b/EAPM/Include/Blocks/ClassificationBioMl.py deleted file mode 100644 index 3300f9f..0000000 --- a/EAPM/Include/Blocks/ClassificationBioMl.py +++ /dev/null @@ -1,395 +0,0 @@ -""" -Bioml Classification - | Wrapper class for the bioml Classification module. - | Train classification models. -""" - -# TODO Add to the documentation - -import os - -from HorusAPI import PluginVariable, SlurmBlock, VariableGroup, VariableList, VariableTypes - -# ==========================# -# Variable inputs -# ==========================# -inputLabelFile = PluginVariable( - name="Input Label", - id="input_label", - description="The path to the labels of the training set in a csv format or string if it is inside training features.", - type=VariableTypes.FILE, - defaultValue=None, - allowedValues=["csv"], -) -inputLabelString = PluginVariable( - name="Input Label", - id="input_label", - description="The labels of the training set in a string format.", - type=VariableTypes.STRING, - defaultValue=None, -) -trainingFeatures = PluginVariable( - name="Training Features", - id="training_features", - description="The file to where the training features are saved in excel or csv format.", - type=VariableTypes.FILE, - defaultValue=None, - allowedValues=["csv", "xlsx"], -) -fileGroup = VariableGroup( - id="fileType_input", - name="Input File", - description="The input is a file", - variables=[inputLabelFile, trainingFeatures], -) -stringGroup = VariableGroup( - id="stringType_input", - name="Input String", - description="The input is a string", - variables=[inputLabelString, trainingFeatures], -) - -# ==========================# -# Variable outputs -# ==========================# -outputClassification = PluginVariable( - name="Classification output", - id="out_zip", - description="The zip file to the output for the classification models", - type=VariableTypes.FILE, -) - -############################## -# Other variables # -############################## -trainingOutput = PluginVariable( - name="Training Output", - id="training_output", - description="The path where to save the models training results.", - type=VariableTypes.FOLDER, - defaultValue=None, -) -scalerVar = PluginVariable( - name="Scaler", - id="scaler", - description="Choose one of the scaler available in scikit-learn, defaults to zscore.", - type=VariableTypes.STRING_LIST, - allowedValues=["robust", "zscore", "minmax"], - defaultValue=None, -) -kfoldParameters = PluginVariable( - name="Kfold Parameters", - id="kfold_parameters", - description="The parameters for the kfold in num_split:test_size format ('5:0.2').", - type=VariableTypes.STRING, - defaultValue=None, -) -outliersVar = PluginVariable( - name="Outliers", - id="outliers", - description="A list of outliers if any, the name should be the same as in the excel file with the filtered features, you can also specify the path to a file in plain text format, each record should be in a new line", - type=VariableTypes.STRING, - defaultValue=None, -) -budgetTime = PluginVariable( - name="Budget Time", - id="budget_time", - description="The time budget for the training in minutes, should be > 0 or None.", - type=VariableTypes.FLOAT, - defaultValue=None, -) -precisionWeight = PluginVariable( - name="Precision Weight", - id="precision_weight", - description="Weights to specify how relevant is the precision for the ranking of the different features.", - type=VariableTypes.FLOAT, - defaultValue=None, -) -recallWeight = PluginVariable( - name="Recall Weight", - id="recall_weight", - description="Weights to specify how relevant is the recall for the ranking of the different features.", - type=VariableTypes.FLOAT, - defaultValue=None, -) -reportWeight = PluginVariable( - name="Report Weight", - id="report_weight", - description="Weights to specify how relevant is the f1, precision and recall for the ranking of the different features with respect to MCC which is a more general measures of the performance of a model.", - type=VariableTypes.FLOAT, - defaultValue=None, -) -differenceWeight = PluginVariable( - name="Difference Weight", - id="difference_weight", - description="How important is to have similar training and test metrics.", - type=VariableTypes.FLOAT, - defaultValue=None, -) -bestModels = PluginVariable( - name="Best Models", - id="best_models", - description="The number of best models to select, it affects the analysis and the saved hyperparameters.", - type=VariableTypes.INTEGER, - defaultValue=None, -) -seedVar = PluginVariable( - name="Seed", - id="seed", - description="The seed for the random state.", - type=VariableTypes.INTEGER, - defaultValue=None, -) -dropVar = PluginVariable( - name="Drop", - id="drop", - description="The models to drop.", - type=VariableTypes.STRING_LIST, - defaultValue=None, - allowedValues=[ - "lr", - "knn", - "nb", - "dt", - "svm", - "rbfsvm", - "gpc", - "mlp", - "ridge", - "rf", - "qda", - "ada", - "gbc", - "lda", - "et", - "xgboost", - "lightgbm", - "catboost", - "dummy", - ], -) -selectedVar = PluginVariable( - name="Selected", - id="selected", - description="The models to select.", - type=VariableTypes.STRING_LIST, - defaultValue=None, - allowedValues=[ - "lr", - "knn", - "nb", - "dt", - "svm", - "rbfsvm", - "gpc", - "mlp", - "ridge", - "rf", - "qda", - "ada", - "gbc", - "lda", - "et", - "xgboost", - "lightgbm", - "catboost", - "dummy", - ], -) -tuneVar = PluginVariable( - name="Tune", - id="tune", - description="If to tune the best models.", - type=VariableTypes.BOOLEAN, - defaultValue=None, -) -plotVar = PluginVariable( - name="Plot", - id="plot", - description="The plots to save.", - type=VariableTypes.STRING_LIST, - defaultValue=None, - allowedValues=["learning", "confusion_matrix", "class_report", "pr", "auc"], -) -optimizeVar = PluginVariable( - name="Optimize", - id="optimize", - description="The metric to optimize for retuning the best models.", - type=VariableTypes.STRING_LIST, - defaultValue=None, - allowedValues=["MCC", "Prec.", "Recall", "F1", "AUC", "Accuracy", "Average Precision Score"], -) -sheetName = PluginVariable( - name="Sheet Name", - id="sheet_name", - description="The sheet name for the excel file if the training features is in excel format.", - type=VariableTypes.STRING, - defaultValue=None, -) -numIter = PluginVariable( - name="Number of Iterations", - id="num_iter", - description="The number of iterations for the hyperparameter search.", - type=VariableTypes.INTEGER, - defaultValue=None, -) -splitStrategy = PluginVariable( - name="Split Strategy", - id="split_strategy", - description="The strategy to split the data.", - type=VariableTypes.STRING_LIST, - defaultValue=None, - allowedValues=["mutations", "cluster", "stratifiedkfold", "kfold"], -) -clusterVar = PluginVariable( - name="Cluster", - id="cluster", - description="The path to the cluster file generated by mmseqs2 or a custom group index file just like data/resultsDB_clu.tsv.", - type=VariableTypes.FILE, - defaultValue=None, -) -mutationsVar = PluginVariable( - name="Mutations", - id="mutations", - description="The column name of the mutations in the training data.", - type=VariableTypes.STRING, - defaultValue=None, -) -testNumMutations = PluginVariable( - name="Test Number of Mutations", - id="test_num_mutations", - description="The threshold of number of mutations to be included in the test set.", - type=VariableTypes.INTEGER, - defaultValue=None, -) -greaterVar = PluginVariable( - name="Greater", - id="greater", - description="Include in the test set, mutations that are greater of less than the threshold, default greater.", - type=VariableTypes.BOOLEAN, - defaultValue=None, -) -shuffleVar = PluginVariable( - name="Shuffle", - id="shuffle", - description="If to shuffle the data before splitting.", - type=VariableTypes.BOOLEAN, - defaultValue=None, -) -crossValidation = PluginVariable( - name="Cross Validation", - id="cross_validation", - description="If to use cross validation, default is True.", - type=VariableTypes.BOOLEAN, - defaultValue=None, -) - - -def runClassificationBioml(block: SlurmBlock): - - input_excel = block.inputs.get("input_excel", None) - if input_excel is None: - raise Exception("No input excel provided") - if not os.path.exists(input_excel): - raise Exception(f"The input excel file does not exist: {input_excel}") - - input_hyperparameters = block.inputs.get("input_hyperparameters", None) - if input_hyperparameters is None: - raise Exception("No input hyperparameters provided") - if not os.path.exists(input_hyperparameters): - raise Exception(f"The input hyperparameters file does not exist: {input_hyperparameters}") - - input_sheets = block.inputs.get("input_sheets", None) - if input_sheets is None: - raise Exception("No input sheets provided") - - input_label = block.inputs.get("input_label", None) - if input_label is None: - raise Exception("No input label provided") - if not os.path.exists(input_label): - raise Exception(f"The input label file does not exist: {input_label}") - - command = "python -m BioML.Ensemble " - command += f"--excel {input_excel} " - command += f"--hyperparameter_path {input_hyperparameters} " - command += f"--sheets {input_sheets} " - command += f"--label {input_label} " - - jobs = [command] - - folderName = block.variables.get("folder_name", "ensembleBioml") - block.extraData["folder_name"] = folderName - removeExisting = block.variables.get("remove_existing_results", False) - - # If folder already exists, raise exception - if removeExisting and os.path.exists(folderName): - os.system("rm -rf " + folderName) - - if not removeExisting and os.path.exists(folderName): - raise Exception( - "The folder {} already exists. Please, choose another name or remove it.".format( - folderName - ) - ) - - # Create an copy the inputs - os.makedirs(folderName, exist_ok=True) - os.system(f"cp {input_excel} {folderName}") - os.system(f"cp {input_hyperparameters} {folderName}") - os.system(f"cp {input_sheets} {folderName}") - os.system(f"cp {input_label} {folderName}") - - from utils import launchCalculationAction - - launchCalculationAction( - block, - jobs, - program="bioml", - uploadFolders=[ - folderName, - ], - ) - - -def finalAction(block: SlurmBlock): - pass - - -from utils import BSC_JOB_VARIABLES - -classificationBioMLBlock = SlurmBlock( - name="Classification BioML", - initialAction=runClassificationBioml, - finalAction=finalAction, - description="Train classification models.", - inputGroups=[fileGroup, stringGroup], - variables=BSC_JOB_VARIABLES - + [ - selectedVar, - dropVar, - trainingOutput, - scalerVar, - kfoldParameters, - outliersVar, - budgetTime, - precisionWeight, - recallWeight, - reportWeight, - differenceWeight, - bestModels, - seedVar, - tuneVar, - plotVar, - optimizeVar, - sheetName, - numIter, - splitStrategy, - clusterVar, - mutationsVar, - testNumMutations, - greaterVar, - shuffleVar, - crossValidation, - ], - outputs=[outputClassification], -) diff --git a/EAPM/Include/Blocks/ConservedResiduesMSA.py b/EAPM/Include/Blocks/ConservedResiduesMSA.py index 4b7b391..bee7ae6 100644 --- a/EAPM/Include/Blocks/ConservedResiduesMSA.py +++ b/EAPM/Include/Blocks/ConservedResiduesMSA.py @@ -1,4 +1,4 @@ -from HorusAPI import PluginBlock, PluginVariable, VariableTypes, Extensions +from HorusAPI import Extensions, PluginBlock, PluginVariable, VariableTypes proteinFolderVariable = PluginVariable( id="protein_folder", @@ -27,11 +27,11 @@ def getConservedMSAPositions(block: PluginBlock): proteinFolder = block.inputs.get("protein_folder", "proteins") - import prepare_proteins - # Check that there is at least one pdb file in the folder import os + import prepare_proteins + hasPDB = False for file in os.listdir(proteinFolder): if file.endswith(".pdb"): diff --git a/EAPM/Include/Blocks/EpPred.py b/EAPM/Include/Blocks/EpPred.py index f30ef04..1d385a2 100644 --- a/EAPM/Include/Blocks/EpPred.py +++ b/EAPM/Include/Blocks/EpPred.py @@ -33,6 +33,13 @@ ############################## # Other variables # ############################## +removeExistingResults = PluginVariable( + name="Remove existing results", + id="remove_existing_results", + description="Remove existing results", + type=VariableTypes.BOOLEAN, + defaultValue=False, +) pssmDir = PluginVariable( name="PSSM directory", id="pssm_dir", @@ -315,6 +322,7 @@ def finalAction(block: SlurmBlock): inputs=[inputFasta], variables=BSC_JOB_VARIABLES + [ + removeExistingResults, pssmDir, fastadir, ifeatureDir, diff --git a/EAPM/Include/Blocks/PDBToMAE.py b/EAPM/Include/Blocks/PDBToMAE.py index 288e6eb..e033459 100644 --- a/EAPM/Include/Blocks/PDBToMAE.py +++ b/EAPM/Include/Blocks/PDBToMAE.py @@ -1,7 +1,7 @@ import os import shutil -from HorusAPI import PluginBlock, VariableTypes, PluginVariable, VariableGroup +from HorusAPI import PluginBlock, PluginVariable, VariableGroup, VariableTypes # Input variables pdbFolderVariable = PluginVariable( @@ -48,12 +48,13 @@ def convertPDBToMAE(block: PluginBlock): # Test if we have valid glide installation command = "echo $SCHRODINGER" output = block.remote.remoteCommand(command) + if output is None or output == "": raise Exception(f"No valid Schrodinger installation found on remote {block.remote.name}") else: print(f"Schrodinger installation found on remote {block.remote.name}: {output}") - run_command = output + "/run" + run_command = str(output) + "/run" import prepare_proteins @@ -145,6 +146,12 @@ def mockSystem(command): if model.endswith(".mae"): os.rename(os.path.join(pdb_folder, model), os.path.join(mae_folder, model)) + elif block.remote.name == "Local": + for model in os.listdir(pdb_folder): + if model.endswith(".mae"): + # Move the MAE files to the output folder + shutil.move(os.path.join(pdb_folder, model), os.path.join(mae_folder, model)) + print( f"Sucessfully converted PDB files to MAE. Files converted: {len(os.listdir(mae_folder))}" ) diff --git a/EAPM/Include/Blocks/PredictBioML.py b/EAPM/Include/Blocks/PredictBioML.py deleted file mode 100644 index 1be5f3c..0000000 --- a/EAPM/Include/Blocks/PredictBioML.py +++ /dev/null @@ -1,360 +0,0 @@ -""" -A module that performs regression analysis on a dataset. -""" - -import os - -from HorusAPI import PluginVariable, SlurmBlock, VariableGroup, VariableList, VariableTypes - -# ==========================# -# Variable inputs -# ==========================# -fastaFile = PluginVariable( - name="Fasta file", - id="fasta_file", - description="The fasta file path.", - type=VariableTypes.FILE, - defaultValue=None, - allowedValues=["fasta"], -) -modelPath = PluginVariable( - name="Model Path", - id="model_path", - description="The path to the model.", - type=VariableTypes.FILE, - defaultValue=None, -) -testFeatures = PluginVariable( - name="Test Features", - id="test_features", - description="The file to where the test features are saved in excel or csv format.", - type=VariableTypes.FILE, - defaultValue=None, - allowedValues=["csv", "xlsx"], -) -trainingFeatures = PluginVariable( - name="Training Features", - id="training_features", - description="The file to where the training features are saved in excel or csv format.", - type=VariableTypes.FILE, - defaultValue=None, - allowedValues=["csv", "xlsx"], -) - -# ==========================# -# Variable outputs -# ==========================# -outputPrediction = PluginVariable( - name="Prediction output", - id="out_zip", - description="The zip file to the output for the prediction results", - type=VariableTypes.FILE, -) - -############################## -# Other variables # -############################## -trainingOutput = PluginVariable( - name="Training Output", - id="training_output", - description="The path where to save the models training results.", - type=VariableTypes.FOLDER, - defaultValue=None, -) -scalerVar = PluginVariable( - name="Scaler", - id="scaler", - description="Choose one of the scaler available in scikit-learn, defaults to zscore.", - type=VariableTypes.STRING_LIST, - allowedValues=["robust", "zscore", "minmax"], - defaultValue=None, -) -kfoldParameters = PluginVariable( - name="Kfold Parameters", - id="kfold_parameters", - description="The parameters for the kfold in num_split:test_size format ('5:0.2').", - type=VariableTypes.STRING, - defaultValue=None, -) -outliersVar = PluginVariable( - name="Outliers", - id="outliers", - description="A list of outliers if any, the name should be the same as in the excel file with the filtered features, you can also specify the path to a file in plain text format, each record should be in a new line", - type=VariableTypes.STRING, - defaultValue=None, -) -budgetTime = PluginVariable( - name="Budget Time", - id="budget_time", - description="The time budget for the training in minutes, should be > 0 or None.", - type=VariableTypes.FLOAT, - defaultValue=None, -) -precisionWeight = PluginVariable( - name="Precision Weight", - id="precision_weight", - description="Weights to specify how relevant is the precision for the ranking of the different features.", - type=VariableTypes.FLOAT, - defaultValue=None, -) -recallWeight = PluginVariable( - name="Recall Weight", - id="recall_weight", - description="Weights to specify how relevant is the recall for the ranking of the different features.", - type=VariableTypes.FLOAT, - defaultValue=None, -) -reportWeight = PluginVariable( - name="Report Weight", - id="report_weight", - description="Weights to specify how relevant is the f1, precision and recall for the ranking of the different features with respect to MCC which is a more general measures of the performance of a model.", - type=VariableTypes.FLOAT, - defaultValue=None, -) -differenceWeight = PluginVariable( - name="Difference Weight", - id="difference_weight", - description="How important is to have similar training and test metrics.", - type=VariableTypes.FLOAT, - defaultValue=None, -) -bestModels = PluginVariable( - name="Best Models", - id="best_models", - description="The number of best models to select, it affects the analysis and the saved hyperparameters.", - type=VariableTypes.INTEGER, - defaultValue=None, -) -seedVar = PluginVariable( - name="Seed", - id="seed", - description="The seed for the random state.", - type=VariableTypes.INTEGER, - defaultValue=None, -) -dropVar = PluginVariable( - name="Drop", - id="drop", - description="The models to drop.", - type=VariableTypes.STRING_LIST, - defaultValue=None, - allowedValues=[ - "lr", - "knn", - "nb", - "dt", - "svm", - "rbfsvm", - "gpc", - "mlp", - "ridge", - "rf", - "qda", - "ada", - "gbc", - "lda", - "et", - "xgboost", - "lightgbm", - "catboost", - "dummy", - ], -) -selectedVar = PluginVariable( - name="Selected", - id="selected", - description="The models to select.", - type=VariableTypes.STRING_LIST, - defaultValue=None, - allowedValues=[ - "lr", - "knn", - "nb", - "dt", - "svm", - "rbfsvm", - "gpc", - "mlp", - "ridge", - "rf", - "qda", - "ada", - "gbc", - "lda", - "et", - "xgboost", - "lightgbm", - "catboost", - "dummy", - ], -) -tuneVar = PluginVariable( - name="Tune", - id="tune", - description="If to tune the best models.", - type=VariableTypes.BOOLEAN, - defaultValue=None, -) -plotVar = PluginVariable( - name="Plot", - id="plot", - description="The plots to save.", - type=VariableTypes.STRING_LIST, - defaultValue=None, - allowedValues=["learning", "confusion_matrix", "class_report", "pr", "auc"], -) -optimizeVar = PluginVariable( - name="Optimize", - id="optimize", - description="The metric to optimize for retuning the best models.", - type=VariableTypes.STRING_LIST, - defaultValue=None, - allowedValues=["MCC", "Prec.", "Recall", "F1", "AUC", "Accuracy", "Average Precision Score"], -) -sheetName = PluginVariable( - name="Sheet Name", - id="sheet_name", - description="The sheet name for the excel file if the training features is in excel format.", - type=VariableTypes.STRING, - defaultValue=None, -) -numIter = PluginVariable( - name="Number of Iterations", - id="num_iter", - description="The number of iterations for the hyperparameter search.", - type=VariableTypes.INTEGER, - defaultValue=None, -) -splitStrategy = PluginVariable( - name="Split Strategy", - id="split_strategy", - description="The strategy to split the data.", - type=VariableTypes.STRING_LIST, - defaultValue=None, - allowedValues=["mutations", "cluster", "stratifiedkfold", "kfold"], -) -clusterVar = PluginVariable( - name="Cluster", - id="cluster", - description="The path to the cluster file generated by mmseqs2 or a custom group index file just like data/resultsDB_clu.tsv.", - type=VariableTypes.FILE, - defaultValue=None, -) -mutationsVar = PluginVariable( - name="Mutations", - id="mutations", - description="The column name of the mutations in the training data.", - type=VariableTypes.STRING, - defaultValue=None, -) -testNumMutations = PluginVariable( - name="Test Number of Mutations", - id="test_num_mutations", - description="The threshold of number of mutations to be included in the test set.", - type=VariableTypes.INTEGER, - defaultValue=None, -) -greaterVar = PluginVariable( - name="Greater", - id="greater", - description="Include in the test set, mutations that are greater of less than the threshold, default greater.", - type=VariableTypes.BOOLEAN, - defaultValue=None, -) -shuffleVar = PluginVariable( - name="Shuffle", - id="shuffle", - description="If to shuffle the data before splitting.", - type=VariableTypes.BOOLEAN, - defaultValue=None, -) -crossValidation = PluginVariable( - name="Cross Validation", - id="cross_validation", - description="If to use cross validation, default is True.", - type=VariableTypes.BOOLEAN, - defaultValue=None, -) - - -def runClassificationBioml(block: SlurmBlock): - - input_excel = block.inputs.get("input_excel", None) - if input_excel is None: - raise Exception("No input excel provided") - if not os.path.exists(input_excel): - raise Exception(f"The input excel file does not exist: {input_excel}") - - input_hyperparameters = block.inputs.get("input_hyperparameters", None) - if input_hyperparameters is None: - raise Exception("No input hyperparameters provided") - if not os.path.exists(input_hyperparameters): - raise Exception(f"The input hyperparameters file does not exist: {input_hyperparameters}") - - input_sheets = block.inputs.get("input_sheets", None) - if input_sheets is None: - raise Exception("No input sheets provided") - - input_label = block.inputs.get("input_label", None) - if input_label is None: - raise Exception("No input label provided") - if not os.path.exists(input_label): - raise Exception(f"The input label file does not exist: {input_label}") - - command = "python -m BioML.Ensemble " - command += f"--excel {input_excel} " - command += f"--hyperparameter_path {input_hyperparameters} " - command += f"--sheets {input_sheets} " - command += f"--label {input_label} " - - jobs = [command] - - folderName = block.variables.get("folder_name", "ensembleBioml") - block.extraData["folder_name"] = folderName - removeExisting = block.variables.get("remove_existing_results", False) - - # If folder already exists, raise exception - if removeExisting and os.path.exists(folderName): - os.system("rm -rf " + folderName) - - if not removeExisting and os.path.exists(folderName): - raise Exception( - "The folder {} already exists. Please, choose another name or remove it.".format( - folderName - ) - ) - - # Create an copy the inputs - os.makedirs(folderName, exist_ok=True) - os.system(f"cp {input_excel} {folderName}") - os.system(f"cp {input_hyperparameters} {folderName}") - os.system(f"cp {input_sheets} {folderName}") - os.system(f"cp {input_label} {folderName}") - - from utils import launchCalculationAction - - launchCalculationAction( - block, - jobs, - program="bioml", - uploadFolders=[ - folderName, - ], - ) - - -def finalAction(block: SlurmBlock): - pass - - -from utils import BSC_JOB_VARIABLES - -PredictBioMLBlock = SlurmBlock( - name="Predict BioMl", - initialAction=runClassificationBioml, - finalAction=finalAction, - description="Predict using the models and average the votations.", - inputs=[fastaFile, modelPath, testFeatures, trainingFeatures], - variables=BSC_JOB_VARIABLES + [], - outputs=[outputPrediction], -) diff --git a/EAPM/Include/Blocks/PrepWizardEAPM.py b/EAPM/Include/Blocks/PrepWizardEAPM.py index 983973f..08e903a 100644 --- a/EAPM/Include/Blocks/PrepWizardEAPM.py +++ b/EAPM/Include/Blocks/PrepWizardEAPM.py @@ -177,7 +177,7 @@ def downloadPrepWizardResults(block: SlurmBlock): downloadResultsAction(block) - folderName = block.variables.get("folder_name") + folderName = block.variables.get("folder_name", "prepared_proteins") # Create the output folder containing the prepared proteins if not os.path.exists(folderName): diff --git a/EAPM/Include/Blocks/RegressionBioMl.py b/EAPM/Include/Blocks/RegressionBioMl.py deleted file mode 100644 index 0da2825..0000000 --- a/EAPM/Include/Blocks/RegressionBioMl.py +++ /dev/null @@ -1,391 +0,0 @@ -""" -A module that performs regression analysis on a dataset. -""" - -import os - -from HorusAPI import PluginVariable, SlurmBlock, VariableGroup, VariableList, VariableTypes - -# ==========================# -# Variable inputs -# ==========================# -inputLabelFile = PluginVariable( - name="Input Label", - id="input_label", - description="The path to the labels of the training set in a csv format or string if it is inside training features.", - type=VariableTypes.FILE, - defaultValue=None, - allowedValues=["csv"], -) -inputLabelString = PluginVariable( - name="Input Label", - id="input_label", - description="The labels of the training set in a string format.", - type=VariableTypes.STRING, - defaultValue=None, -) -trainingFeatures = PluginVariable( - name="Training Features", - id="training_features", - description="The file to where the training features are saved in excel or csv format.", - type=VariableTypes.FILE, - defaultValue=None, - allowedValues=["csv", "xlsx"], -) -fileGroup = VariableGroup( - id="fileType_input", - name="Input File", - description="The input is a file", - variables=[inputLabelFile, trainingFeatures], -) -stringGroup = VariableGroup( - id="stringType_input", - name="Input String", - description="The input is a string", - variables=[inputLabelString, trainingFeatures], -) - -# ==========================# -# Variable outputs -# ==========================# -outputClassification = PluginVariable( - name="Regression output", - id="out_zip", - description="The zip file to the output for the regression models", - type=VariableTypes.FILE, -) - -############################## -# Other variables # -############################## -trainingOutput = PluginVariable( - name="Training Output", - id="training_output", - description="The path where to save the models training results.", - type=VariableTypes.FOLDER, - defaultValue=None, -) -scalerVar = PluginVariable( - name="Scaler", - id="scaler", - description="Choose one of the scaler available in scikit-learn, defaults to zscore.", - type=VariableTypes.STRING_LIST, - allowedValues=["robust", "zscore", "minmax"], - defaultValue=None, -) -kfoldParameters = PluginVariable( - name="Kfold Parameters", - id="kfold_parameters", - description="The parameters for the kfold in num_split:test_size format ('5:0.2').", - type=VariableTypes.STRING, - defaultValue=None, -) -outliersVar = PluginVariable( - name="Outliers", - id="outliers", - description="A list of outliers if any, the name should be the same as in the excel file with the filtered features, you can also specify the path to a file in plain text format, each record should be in a new line", - type=VariableTypes.STRING, - defaultValue=None, -) -budgetTime = PluginVariable( - name="Budget Time", - id="budget_time", - description="The time budget for the training in minutes, should be > 0 or None.", - type=VariableTypes.FLOAT, - defaultValue=None, -) -precisionWeight = PluginVariable( - name="Precision Weight", - id="precision_weight", - description="Weights to specify how relevant is the precision for the ranking of the different features.", - type=VariableTypes.FLOAT, - defaultValue=None, -) -recallWeight = PluginVariable( - name="Recall Weight", - id="recall_weight", - description="Weights to specify how relevant is the recall for the ranking of the different features.", - type=VariableTypes.FLOAT, - defaultValue=None, -) -reportWeight = PluginVariable( - name="Report Weight", - id="report_weight", - description="Weights to specify how relevant is the f1, precision and recall for the ranking of the different features with respect to MCC which is a more general measures of the performance of a model.", - type=VariableTypes.FLOAT, - defaultValue=None, -) -differenceWeight = PluginVariable( - name="Difference Weight", - id="difference_weight", - description="How important is to have similar training and test metrics.", - type=VariableTypes.FLOAT, - defaultValue=None, -) -bestModels = PluginVariable( - name="Best Models", - id="best_models", - description="The number of best models to select, it affects the analysis and the saved hyperparameters.", - type=VariableTypes.INTEGER, - defaultValue=None, -) -seedVar = PluginVariable( - name="Seed", - id="seed", - description="The seed for the random state.", - type=VariableTypes.INTEGER, - defaultValue=None, -) -dropVar = PluginVariable( - name="Drop", - id="drop", - description="The models to drop.", - type=VariableTypes.STRING_LIST, - defaultValue=None, - allowedValues=[ - "lr", - "knn", - "nb", - "dt", - "svm", - "rbfsvm", - "gpc", - "mlp", - "ridge", - "rf", - "qda", - "ada", - "gbc", - "lda", - "et", - "xgboost", - "lightgbm", - "catboost", - "dummy", - ], -) -selectedVar = PluginVariable( - name="Selected", - id="selected", - description="The models to select.", - type=VariableTypes.STRING_LIST, - defaultValue=None, - allowedValues=[ - "lr", - "knn", - "nb", - "dt", - "svm", - "rbfsvm", - "gpc", - "mlp", - "ridge", - "rf", - "qda", - "ada", - "gbc", - "lda", - "et", - "xgboost", - "lightgbm", - "catboost", - "dummy", - ], -) -tuneVar = PluginVariable( - name="Tune", - id="tune", - description="If to tune the best models.", - type=VariableTypes.BOOLEAN, - defaultValue=None, -) -plotVar = PluginVariable( - name="Plot", - id="plot", - description="The plots to save.", - type=VariableTypes.STRING_LIST, - defaultValue=None, - allowedValues=["learning", "confusion_matrix", "class_report", "pr", "auc"], -) -optimizeVar = PluginVariable( - name="Optimize", - id="optimize", - description="The metric to optimize for retuning the best models.", - type=VariableTypes.STRING_LIST, - defaultValue=None, - allowedValues=["MCC", "Prec.", "Recall", "F1", "AUC", "Accuracy", "Average Precision Score"], -) -sheetName = PluginVariable( - name="Sheet Name", - id="sheet_name", - description="The sheet name for the excel file if the training features is in excel format.", - type=VariableTypes.STRING, - defaultValue=None, -) -numIter = PluginVariable( - name="Number of Iterations", - id="num_iter", - description="The number of iterations for the hyperparameter search.", - type=VariableTypes.INTEGER, - defaultValue=None, -) -splitStrategy = PluginVariable( - name="Split Strategy", - id="split_strategy", - description="The strategy to split the data.", - type=VariableTypes.STRING_LIST, - defaultValue=None, - allowedValues=["mutations", "cluster", "stratifiedkfold", "kfold"], -) -clusterVar = PluginVariable( - name="Cluster", - id="cluster", - description="The path to the cluster file generated by mmseqs2 or a custom group index file just like data/resultsDB_clu.tsv.", - type=VariableTypes.FILE, - defaultValue=None, -) -mutationsVar = PluginVariable( - name="Mutations", - id="mutations", - description="The column name of the mutations in the training data.", - type=VariableTypes.STRING, - defaultValue=None, -) -testNumMutations = PluginVariable( - name="Test Number of Mutations", - id="test_num_mutations", - description="The threshold of number of mutations to be included in the test set.", - type=VariableTypes.INTEGER, - defaultValue=None, -) -greaterVar = PluginVariable( - name="Greater", - id="greater", - description="Include in the test set, mutations that are greater of less than the threshold, default greater.", - type=VariableTypes.BOOLEAN, - defaultValue=None, -) -shuffleVar = PluginVariable( - name="Shuffle", - id="shuffle", - description="If to shuffle the data before splitting.", - type=VariableTypes.BOOLEAN, - defaultValue=None, -) -crossValidation = PluginVariable( - name="Cross Validation", - id="cross_validation", - description="If to use cross validation, default is True.", - type=VariableTypes.BOOLEAN, - defaultValue=None, -) - - -def runClassificationBioml(block: SlurmBlock): - - input_excel = block.inputs.get("input_excel", None) - if input_excel is None: - raise Exception("No input excel provided") - if not os.path.exists(input_excel): - raise Exception(f"The input excel file does not exist: {input_excel}") - - input_hyperparameters = block.inputs.get("input_hyperparameters", None) - if input_hyperparameters is None: - raise Exception("No input hyperparameters provided") - if not os.path.exists(input_hyperparameters): - raise Exception(f"The input hyperparameters file does not exist: {input_hyperparameters}") - - input_sheets = block.inputs.get("input_sheets", None) - if input_sheets is None: - raise Exception("No input sheets provided") - - input_label = block.inputs.get("input_label", None) - if input_label is None: - raise Exception("No input label provided") - if not os.path.exists(input_label): - raise Exception(f"The input label file does not exist: {input_label}") - - command = "python -m BioML.Ensemble " - command += f"--excel {input_excel} " - command += f"--hyperparameter_path {input_hyperparameters} " - command += f"--sheets {input_sheets} " - command += f"--label {input_label} " - - jobs = [command] - - folderName = block.variables.get("folder_name", "ensembleBioml") - block.extraData["folder_name"] = folderName - removeExisting = block.variables.get("remove_existing_results", False) - - # If folder already exists, raise exception - if removeExisting and os.path.exists(folderName): - os.system("rm -rf " + folderName) - - if not removeExisting and os.path.exists(folderName): - raise Exception( - "The folder {} already exists. Please, choose another name or remove it.".format( - folderName - ) - ) - - # Create an copy the inputs - os.makedirs(folderName, exist_ok=True) - os.system(f"cp {input_excel} {folderName}") - os.system(f"cp {input_hyperparameters} {folderName}") - os.system(f"cp {input_sheets} {folderName}") - os.system(f"cp {input_label} {folderName}") - - from utils import launchCalculationAction - - launchCalculationAction( - block, - jobs, - program="bioml", - uploadFolders=[ - folderName, - ], - ) - - -def finalAction(block: SlurmBlock): - pass - - -from utils import BSC_JOB_VARIABLES - -regressionBioMLBlock = SlurmBlock( - name="Regression BioMl", - initialAction=runClassificationBioml, - finalAction=finalAction, - description="Train Regression models.", - inputGroups=[fileGroup, stringGroup], - variables=BSC_JOB_VARIABLES - + [ - selectedVar, - dropVar, - trainingOutput, - scalerVar, - kfoldParameters, - outliersVar, - budgetTime, - precisionWeight, - recallWeight, - reportWeight, - differenceWeight, - bestModels, - seedVar, - tuneVar, - plotVar, - optimizeVar, - sheetName, - numIter, - splitStrategy, - clusterVar, - mutationsVar, - testNumMutations, - greaterVar, - shuffleVar, - crossValidation, - ], - outputs=[outputClassification], -) diff --git a/EAPM/Include/Blocks/SetupDockingGrid.py b/EAPM/Include/Blocks/SetupDockingGrid.py index dbdb4c4..4a41dfd 100644 --- a/EAPM/Include/Blocks/SetupDockingGrid.py +++ b/EAPM/Include/Blocks/SetupDockingGrid.py @@ -146,7 +146,7 @@ def glideDocking(block: SlurmBlock): from utils import launchCalculationAction - launchCalculationAction(block, jobs, "glide", ["grid"]) + launchCalculationAction(block, jobs, "schrodinger", ["grid"]) def downloadGridResults(block: SlurmBlock): diff --git a/EAPM/Include/Blocks/SetupGlide.py b/EAPM/Include/Blocks/SetupGlide.py index b6153fd..1107585 100644 --- a/EAPM/Include/Blocks/SetupGlide.py +++ b/EAPM/Include/Blocks/SetupGlide.py @@ -133,7 +133,7 @@ def setupGlideDocking(block: SlurmBlock): from utils import launchCalculationAction launchCalculationAction( - block, jobs, "glide", uploadFolders=["docking", "grid", relative_ligand_folder] + block, jobs, "schrodinger", uploadFolders=["docking", "grid", relative_ligand_folder] ) diff --git a/EAPM/Include/Blocks/TrimAlphafoldModels.py b/EAPM/Include/Blocks/TrimAlphafoldModels.py index 57df227..2d96444 100644 --- a/EAPM/Include/Blocks/TrimAlphafoldModels.py +++ b/EAPM/Include/Blocks/TrimAlphafoldModels.py @@ -81,7 +81,7 @@ def trimAlphaFoldModels(block: PluginBlock): trimmed_folder = os.path.join(os.getcwd(), "trimmed_models") # Set the output - block.setOutput("trimmed_models", trimmed_folder) + block.setOutput(trimmedModelsOutputAF.id, trimmed_folder) outPdb = None for file in os.listdir(trimmed_folder): @@ -89,7 +89,7 @@ def trimAlphaFoldModels(block: PluginBlock): outPdb = os.path.join(trimmed_folder, file) break - block.setOutput("out_pdb", outPdb) + block.setOutput(outputPDBAF.id, outPdb) trimAlphaFoldModelsBlock = PluginBlock( diff --git a/EAPM/Include/Blocks/analyseGlideFelip.py b/EAPM/Include/Blocks/analyseGlideFelip.py new file mode 100644 index 0000000..2f5994b --- /dev/null +++ b/EAPM/Include/Blocks/analyseGlideFelip.py @@ -0,0 +1,104 @@ +import json +import os +import shutil + +import bsc_calculations +import pandas as pd +import prepare_proteins + +from HorusAPI import PluginBlock, PluginVariable, VariableTypes + +# ==========================# +# Variable inputs +# ==========================# +fasta_fileAF = PluginVariable( + name="Fasta file", + id="fasta_file", + description="The input fasta file.", + type=VariableTypes.FILE, + defaultValue=None, + allowedValues=["fasta"], +) + +# Output variables +outputModelsVariable = PluginVariable( + id="models", + name="Alphafold models", + description="The output models", + type=VariableTypes.FOLDER, +) + + +def finalAction(block: PluginBlock): + + models = prepare_proteins.proteinModels("models") + + with open("lig_atom_name.json", "r") as f: + lig_atom_name = json.load(f) + + triads = {} + triads["FeLip9"] = [111, 190, 167] + + triad_atoms = {} + triad_atoms["ser_OG"] = {} + triad_atoms["his_ND1"] = {} + triad_atoms["his_NE2"] = {} + triad_atoms["asp_OD1"] = {} + triad_atoms["asp_OD2"] = {} + + for model in models: # Iterate the models inside the library + S = triads[model][0] + H = triads[model][1] + D = triads[model][2] + for r in models.structures[ + model + ].get_residues(): # Iterate the residues for each Bio.PDB.Structure object + if r.id[1] == S: # Check that the residue matches the defined index + assert ( + r.resname == "SER" + ) # Assert that the residue has the correct residue identity + triad_atoms["ser_OG"][model] = ( + r.get_parent().id, + r.id[1], + "OG", + ) # Store the corresponsing tuple. + elif r.id[1] == H: + assert r.resname == "HIS" + triad_atoms["his_ND1"][model] = (r.get_parent().id, r.id[1], "ND1") + triad_atoms["his_NE2"][model] = (r.get_parent().id, r.id[1], "NE2") + elif r.id[1] == D: + assert r.resname == "ASP" + triad_atoms["asp_OD1"][model] = (r.get_parent().id, r.id[1], "OD1") + triad_atoms["asp_OD2"][model] = (r.get_parent().id, r.id[1], "OD2") + + atom_pairs = {} # Define the dictionary containing the atom pairs for each model + for model in models: + atom_pairs[model] = {} + atom_pairs[model]["PET"] = [] + atom_pairs[model]["PET"].append((triad_atoms["ser_OG"][model], lig_atom_name[2]["C1"])) + atom_pairs[model]["PET"].append((triad_atoms["ser_OG"][model], lig_atom_name[2]["C8"])) + atom_pairs[model]["PET"].append((triad_atoms["ser_OG"][model], lig_atom_name[4]["C1"])) + atom_pairs[model]["PET"].append((triad_atoms["ser_OG"][model], lig_atom_name[4]["C8"])) + atom_pairs[model]["PET"].append((triad_atoms["ser_OG"][model], lig_atom_name[6]["C1"])) + atom_pairs[model]["PET"].append((triad_atoms["ser_OG"][model], lig_atom_name[6]["C8"])) + atom_pairs[model]["PET"].append((triad_atoms["ser_OG"][model], lig_atom_name[8]["C1"])) + atom_pairs[model]["PET"].append((triad_atoms["ser_OG"][model], lig_atom_name[8]["C8"])) + + models.analyseDocking("docking", atom_pairs=atom_pairs) + + metric_distances = {} # Define the global dictionary + metric_distances["OG_C"] = {} # Define the metric nested dictionary + + for model in models: + metric_distances["OG_C"][model] = {} # Define the model nested dictionary + for ligand in models.docking_ligands[model]: + # Define the ligand nested dictionary with all the docking distances list + metric_distances["OG_C"][model][ligand] = models.getDockingDistances(model, ligand) + + models.combineDockingDistancesIntoMetrics(metric_distances) + + best_poses = models.getBestDockingPosesIteratively(metric_distances) + + models.extractDockingPoses(best_poses, "docking", "best_docking_poses") + + block.setOutput(outputModelsVariable.id, "best_docking_poses") diff --git a/EAPM/Include/Blocks/outliersBioMl.py b/EAPM/Include/Blocks/outliersBioMl.py deleted file mode 100644 index d402e4a..0000000 --- a/EAPM/Include/Blocks/outliersBioMl.py +++ /dev/null @@ -1,152 +0,0 @@ -""" -A module that performs regression analysis on a dataset. -""" - -import os - -from HorusAPI import PluginVariable, SlurmBlock, VariableGroup, VariableList, VariableTypes - -# ==========================# -# Variable inputs -# ==========================# -excelFile = PluginVariable( - name="Excel file", - id="excel_file", - description="The file to where the selected features are saved in excel format.", - type=VariableTypes.FILE, - defaultValue=None, - allowedValues=["xlsx"], -) - - -# ==========================# -# Variable outputs -# ==========================# -outputOutliers = PluginVariable( - name="Outliers output", - id="out_zip", - description="The path to the output for the outliers.", - type=VariableTypes.FILE, -) - -############################## -# Other variables # -############################## -numThreads = PluginVariable( - name="Number of threads", - id="num_threads", - description="The number of threads to use.", - type=VariableTypes.INTEGER, - defaultValue=None, -) -scalerVar = PluginVariable( - name="Scaler", - id="scaler", - description="The scaler to use.", - type=VariableTypes.STRING, - defaultValue="StandardScaler", - allowedValues=["StandardScaler", "MinMaxScaler", "RobustScaler"], -) -contaminationVar = PluginVariable( - name="Contamination", - id="contamination", - description="The contamination value.", - type=VariableTypes.FLOAT, - defaultValue=None, -) -numFeatures = PluginVariable( - name="Number of features", - id="num_features", - description="The number of features to use.", - type=VariableTypes.INTEGER, - defaultValue=None, -) - - -def runClassificationBioml(block: SlurmBlock): - - input_excel = block.inputs.get("input_excel", None) - if input_excel is None: - raise Exception("No input excel provided") - if not os.path.exists(input_excel): - raise Exception(f"The input excel file does not exist: {input_excel}") - - input_hyperparameters = block.inputs.get("input_hyperparameters", None) - if input_hyperparameters is None: - raise Exception("No input hyperparameters provided") - if not os.path.exists(input_hyperparameters): - raise Exception(f"The input hyperparameters file does not exist: {input_hyperparameters}") - - input_sheets = block.inputs.get("input_sheets", None) - if input_sheets is None: - raise Exception("No input sheets provided") - - input_label = block.inputs.get("input_label", None) - if input_label is None: - raise Exception("No input label provided") - if not os.path.exists(input_label): - raise Exception(f"The input label file does not exist: {input_label}") - - command = "python -m BioML.Ensemble " - command += f"--excel {input_excel} " - command += f"--hyperparameter_path {input_hyperparameters} " - command += f"--sheets {input_sheets} " - command += f"--label {input_label} " - - jobs = [command] - - folderName = block.variables.get("folder_name", "ensembleBioml") - block.extraData["folder_name"] = folderName - removeExisting = block.variables.get("remove_existing_results", False) - - # If folder already exists, raise exception - if removeExisting and os.path.exists(folderName): - os.system("rm -rf " + folderName) - - if not removeExisting and os.path.exists(folderName): - raise Exception( - "The folder {} already exists. Please, choose another name or remove it.".format( - folderName - ) - ) - - # Create an copy the inputs - os.makedirs(folderName, exist_ok=True) - os.system(f"cp {input_excel} {folderName}") - os.system(f"cp {input_hyperparameters} {folderName}") - os.system(f"cp {input_sheets} {folderName}") - os.system(f"cp {input_label} {folderName}") - - from utils import launchCalculationAction - - launchCalculationAction( - block, - jobs, - program="bioml", - uploadFolders=[ - folderName, - ], - ) - - -def finalAction(block: SlurmBlock): - pass - - -from utils import BSC_JOB_VARIABLES - -outliersBioMLBlock = SlurmBlock( - name="Regression BioMl", - initialAction=runClassificationBioml, - finalAction=finalAction, - description="Train Regression models.", - inputs=[excelFile], - variables=BSC_JOB_VARIABLES - + [ - numThreads, - scalerVar, - contaminationVar, - numFeatures, - ], - outputs=[outputOutliers], -) diff --git a/EAPM/Include/Blocks/splitMethodsBioMl.py b/EAPM/Include/Blocks/splitMethodsBioMl.py deleted file mode 100644 index e69de29..0000000 diff --git a/EAPM/Include/Blocks/testBlock.py b/EAPM/Include/Blocks/testBlock.py new file mode 100644 index 0000000..36b14cb --- /dev/null +++ b/EAPM/Include/Blocks/testBlock.py @@ -0,0 +1,75 @@ +import os +import shutil + +from HorusAPI import PluginBlock, PluginVariable, VariableTypes + +# ==========================# +# Variable inputs +# ==========================# +fasta_fileAF = PluginVariable( + name="Fasta file", + id="fasta_file", + description="The input fasta file.", + type=VariableTypes.FILE, + defaultValue=None, + allowedValues=["fasta"], +) + +# ==========================# +# Variables +# ==========================# +outputAF = PluginVariable( + name="Alphafold simulation folder", + id="folder_name", + description="The name of the folder where the simulation will be stored.", + type=VariableTypes.STRING, + defaultValue="alphafold", +) +removeExistingResults = PluginVariable( + name="Remove existing results", + id="remove_existing_results", + description="Remove existing results", + type=VariableTypes.BOOLEAN, + defaultValue=False, +) + +# Output variables +outputModelsVariable = PluginVariable( + id="models", + name="Alphafold models", + description="The output models", + type=VariableTypes.FOLDER, +) + + +def finalAlhafoldAction(block: PluginBlock): + + resultsFolder = "alphafold" + downloaded_path = "/home/perry/data/acanella/testHorus/all_test/alphafold" + + output_models_folder = os.path.join(downloaded_path, resultsFolder, "output_models") + + if not os.path.exists("structures"): + os.makedirs("structures") + + rank = 0 + for model in os.listdir(output_models_folder): + if os.path.exists(f"{output_models_folder}/" + model + "/ranked_" + str(rank) + ".pdb"): + shutil.copyfile( + f"{output_models_folder}/" + model + "/ranked_" + str(rank) + ".pdb", + "structures/" + model + ".pdb", + ) + + block.setOutput(outputModelsVariable.id, "structures") + + +from utils import BSC_JOB_VARIABLES + +testBlock = PluginBlock( + name="Test block", + description="Test", + action=finalAlhafoldAction, + variables=BSC_JOB_VARIABLES + [outputAF, removeExistingResults], + inputs=[fasta_fileAF], + outputs=[outputModelsVariable], +) diff --git a/EAPM/Include/Configs/hmmerConfig.py b/EAPM/Include/Configs/hmmerConfig.py index 42e6858..b3c5322 100644 --- a/EAPM/Include/Configs/hmmerConfig.py +++ b/EAPM/Include/Configs/hmmerConfig.py @@ -15,7 +15,7 @@ def checkHmmerInstallation(block: PluginConfig): print("verifying HMMER installation") # Get the path to the mafft executable - hmmerPath = block.variables.get("HMMER_path") + hmmerPath = block.variables.get(hmmerPathVariable.id) # Check if the path is valid if not os.path.isfile(hmmerPath): diff --git a/EAPM/Include/Configs/mafftConfig.py b/EAPM/Include/Configs/mafftConfig.py index 17cb210..c27eda4 100644 --- a/EAPM/Include/Configs/mafftConfig.py +++ b/EAPM/Include/Configs/mafftConfig.py @@ -15,7 +15,7 @@ def checkMAFFTInstallation(block: PluginConfig): print("verifying MAFFT installation") # Get the path to the mafft executable - mafftPath = block.variables.get("MAFFT_path") + mafftPath = block.variables.get(mafftPathVariable.id) # Check if the path is valid if not os.path.isfile(mafftPath): diff --git a/EAPM/Include/utils.py b/EAPM/Include/utils.py index 8f8a92e..9201443 100644 --- a/EAPM/Include/utils.py +++ b/EAPM/Include/utils.py @@ -4,7 +4,8 @@ import subprocess import typing -from HorusAPI import PluginBlock, PluginVariable, SlurmBlock, VariableList, VariableTypes +from HorusAPI import (PluginBlock, PluginVariable, SlurmBlock, VariableList, + VariableTypes) localIPs = {"cactus": "84.88.51.217", "blossom": "84.88.51.250", "bubbles": "84.88.51.219"} @@ -19,6 +20,7 @@ def setup_bsc_calculations_based_on_horus_remote( job_name, program, modulePurge, + cpus_per_task, ): import bsc_calculations @@ -32,7 +34,13 @@ def setup_bsc_calculations_based_on_horus_remote( # If we are working with pele, only marenostrum and nord3 are allowed if program == "pele": - if cluster not in ["mn1.bsc.es", "mn2.bsc.es", "mn3.bsc.es", "nord3.bsc.es"]: + if cluster not in [ + "glogin1.bsc.es", + "glogin4.bsc.es", + "glogin3.bsc.es", + "glogin4.bsc.es", + "nord3.bsc.es", + ]: raise Exception("Pele can only be run on Marenostrum or Nord3") if cluster == "nord3.bsc.es": @@ -43,8 +51,8 @@ def setup_bsc_calculations_based_on_horus_remote( general_script=scriptName, scripts_folder=scriptName + "_scripts", ) - elif "mn" in cluster: - bsc_calculations.marenostrum.setUpPELEForMarenostrum( + elif "glogin" in cluster: + bsc_calculations.mn5.setUpPELEForMarenostrum( jobs, partition=partition, cpus=cpus, @@ -56,26 +64,27 @@ def setup_bsc_calculations_based_on_horus_remote( ## Define cluster # cte_power - if cluster == "plogin1.bsc.es": - bsc_calculations.cte_power.jobArrays( - jobs, - job_name=job_name, - partition=partition, - program=program, - script_name=scriptName, - gpus=cpus, - module_purge=modulePurge, - ) + # if cluster == "plogin1.bsc.es": + # bsc_calculations.cte_power.jobArrays( + # jobs, + # job_name=job_name, + # partition=partition, + # program=program, + # script_name=scriptName, + # gpus=cpus, + # module_purge=modulePurge, + # ) # marenostrum - elif "mn" in cluster: + elif "glogin" in cluster or "alogin" in cluster: print("Generating Marenostrum jobs...") - bsc_calculations.marenostrum.jobArrays( + bsc_calculations.mn5.jobArrays( jobs, job_name=job_name, partition=partition, program=program, script_name=scriptName, - cpus=cpus, + ntasks=cpus, + cpus_per_task=cpus_per_task, module_purge=modulePurge, ) # minotauro @@ -91,7 +100,7 @@ def setup_bsc_calculations_based_on_horus_remote( module_purge=modulePurge, ) # nord3 - elif cluster == "nord3.bsc.es": + elif "nord" in cluster: print("Generating nord3 jobs...") bsc_calculations.nord3.jobArrays( jobs, @@ -102,6 +111,18 @@ def setup_bsc_calculations_based_on_horus_remote( cpus=cpus, module_purge=modulePurge, ) + # cte-amd + elif "amdlogin" in cluster: + print("Generating cte-amd jobs...") + bsc_calculations.amd.jobArrays( + jobs, + job_name=job_name, + partition=partition, + program=program, + script_name=scriptName, + cpus=cpus, + #module_purge=modulePurge, + ) # powerpuff elif cluster == "powerpuff": print("Generating powerpuff girls jobs...") @@ -164,6 +185,7 @@ def launchCalculationAction( partition = block.variables.get("partition") cpus = block.variables.get("cpus") + cpus_per_task = block.variables.get("cpus_per_task") simulationName = block.variables.get("folder_name") scriptName = block.variables.get("script_name", "calculation_script.sh") @@ -184,6 +206,7 @@ def launchCalculationAction( simulationName, program, modulePurge, + cpus_per_task, ) # Read the environment variables @@ -406,8 +429,8 @@ def downloadResultsAction(block: SlurmBlock): id="partition", description="Partition where to lunch.", type=VariableTypes.STRING_LIST, - defaultValue="bsc_ls", - allowedValues=["bsc_ls", "debug"], + defaultValue="gp_bscls", + allowedValues=["gp_bscls", "gp_debug", "acc_bscls", "acc_debug", "debug", "bsc_ls"], category="Slurm configuration", ) @@ -420,6 +443,15 @@ def downloadResultsAction(block: SlurmBlock): category="Slurm configuration", ) +cpusPerTaskVariable = PluginVariable( + name="CPUs per task", + id="cpus_per_task", + description="Number of CPUs per task to use.", + type=VariableTypes.INTEGER, + defaultValue=1, + category="Slurm configuration", +) + removeFolderOnFinishVariable = PluginVariable( name="Remove remote folder on finish", id="remove_folder_on_finish", @@ -461,4 +493,5 @@ def downloadResultsAction(block: SlurmBlock): cpusVariable, environmentList, removeFolderOnFinishVariable, + cpusPerTaskVariable, ] diff --git a/EAPM/config/eapm.json b/EAPM/config/eapm.json index fd03e90..e4550b8 100644 --- a/EAPM/config/eapm.json +++ b/EAPM/config/eapm.json @@ -1,4 +1,4 @@ { - "mafft_path": "/home/albertcs/miniconda3/bin/mafft", + "mafft_path": "/home/perry/miniconda3/envs/horus/bin/mafft", "hmmer_path": "/gpfs/projects/bsc72/conda_envs/hmm/bin/hmmsearch" } \ No newline at end of file diff --git a/EAPM/plugin.meta b/EAPM/plugin.meta index 62dacb3..737f1dd 100644 --- a/EAPM/plugin.meta +++ b/EAPM/plugin.meta @@ -6,10 +6,10 @@ "pluginFile": "EAPM.py", "dependencies": [ "pyhmmer", + "numpy", "pandas", "scipy", "pyyaml", - "numpy", "matplotlib", "seaborn", "mdtraj", @@ -21,6 +21,20 @@ "git+https://github.com/Martin-Floor/bsc_calculations.git", "git+https://github.com/Martin-Floor/prepare_proteins.git", "git+https://github.com/Martin-Floor/PELE_scripts.git", - "biopython==1.81" + "biopython==1.81", + "scikit_learn", + "biopython", + "shap", + "matplotlib", + "openpyxl", + "pyod", + "combo", + "mlflow", + "optuna", + "transformers", + "optuna_integration", + "lightning", + "peft", + "datasets" ] } diff --git a/EAPM/preinst.sh b/EAPM/preinst.sh new file mode 100644 index 0000000..b19f3b8 --- /dev/null +++ b/EAPM/preinst.sh @@ -0,0 +1,3 @@ +pip install "pycaret[analysis, models]" --target deps + +pip install "werkzeug<=2.3.0" From d07c1d8a420387bd6609b4d9093033a8cbb1f17b Mon Sep 17 00:00:00 2001 From: AlbertCS Date: Fri, 31 May 2024 17:33:42 +0200 Subject: [PATCH 5/6] changes for the release --- EAPM/EAPM.py | 16 +- EAPM/Include/Blocks/Ahatool.py | 8 +- EAPM/Include/Blocks/AlignPdbEAPM.py | 18 ++- EAPM/Include/Blocks/AlphaFoldEAPM.py | 6 +- .../{AnalyseGlideGPX.py => AnalyseGlide.py} | 141 ++++++++++++----- EAPM/Include/Blocks/AnalyseGlideDocking.py | 9 +- EAPM/Include/Blocks/AsiteDesign.py | 11 +- EAPM/Include/Blocks/ConservedResiduesMSA.py | 6 +- EAPM/Include/Blocks/EpPred.py | 8 +- EAPM/Include/Blocks/HmmAlign.py | 13 +- EAPM/Include/Blocks/HmmBuild.py | 13 +- EAPM/Include/Blocks/HmmScan.py | 12 +- EAPM/Include/Blocks/HmmSearch.py | 13 +- EAPM/Include/Blocks/HmmSearchLocal.py | 35 ++--- EAPM/Include/Blocks/JackHmmer.py | 36 +++-- EAPM/Include/Blocks/MSA2HMM.py | 25 ++- EAPM/Include/Blocks/Mafft.py | 5 +- EAPM/Include/Blocks/PDBToMAE.py | 36 +++-- EAPM/Include/Blocks/PeleEAPM.py | 142 ++++++++++-------- EAPM/Include/Blocks/PrepWizardEAPM.py | 81 +++++++--- EAPM/Include/Blocks/Rbcavity.py | 93 ++++++++++++ EAPM/Include/Blocks/Rbdock.py | 128 ++++++++++++++++ EAPM/Include/Blocks/SetupDockingGrid.py | 6 +- EAPM/Include/Blocks/SetupGlide.py | 8 +- EAPM/Include/Blocks/TrimAlphafoldModels.py | 5 +- EAPM/Include/Blocks/testBlock.py | 5 +- EAPM/Include/Configs/hmmerConfig.py | 2 +- EAPM/Include/utils.py | 7 +- EAPM/config/eapm.json | 2 +- EAPM/preinst.sh | 4 +- 30 files changed, 650 insertions(+), 244 deletions(-) rename EAPM/Include/Blocks/{AnalyseGlideGPX.py => AnalyseGlide.py} (55%) create mode 100644 EAPM/Include/Blocks/Rbcavity.py create mode 100644 EAPM/Include/Blocks/Rbdock.py diff --git a/EAPM/EAPM.py b/EAPM/EAPM.py index a7ad8f3..d6f6c1c 100644 --- a/EAPM/EAPM.py +++ b/EAPM/EAPM.py @@ -95,13 +95,21 @@ def createPlugin(): eapmPlugin.addBlock(epPredBlock) - from Blocks.testBlock import testBlock # type: ignore + # from Blocks.testBlock import testBlock # type: ignore - eapmPlugin.addBlock(testBlock) + # eapmPlugin.addBlock(testBlock) - from Blocks.AnalyseGlideGPX import AnalyseGPXBlock # type: ignore + from Blocks.AnalyseGlide import AnalyseGBlock # type: ignore - eapmPlugin.addBlock(AnalyseGPXBlock) + eapmPlugin.addBlock(AnalyseGBlock) + + # from Blocks.Rbcavity import rbCavityBlock # type: ignore + + # eapmPlugin.addBlock(rbCavityBlock) + + # from Blocks.Rbdock import rbDockBlock # type: ignore + + # eapmPlugin.addBlock(rbDockBlock) # Add the configs from Configs.mafftConfig import mafftExecutableConfig # type: ignore diff --git a/EAPM/Include/Blocks/Ahatool.py b/EAPM/Include/Blocks/Ahatool.py index 1e6e64b..e1865e3 100644 --- a/EAPM/Include/Blocks/Ahatool.py +++ b/EAPM/Include/Blocks/Ahatool.py @@ -1,7 +1,3 @@ -import datetime -import os -import subprocess - from HorusAPI import PluginBlock, PluginVariable, VariableList, VariableTypes # TODO Add to the documentation @@ -85,6 +81,10 @@ def initialAction(block: PluginBlock): + import datetime + import os + import subprocess + container_name = block.inputs.get("container_name", "bsceapm/ahatool:2.2") input_fasta = block.inputs.get("input_fasta", None) fasta = os.path.basename(input_fasta) diff --git a/EAPM/Include/Blocks/AlignPdbEAPM.py b/EAPM/Include/Blocks/AlignPdbEAPM.py index 5593445..d3ef54a 100644 --- a/EAPM/Include/Blocks/AlignPdbEAPM.py +++ b/EAPM/Include/Blocks/AlignPdbEAPM.py @@ -124,26 +124,30 @@ def initialAlign(block: PluginBlock): alignmentMode = block.variables.get("alignment_mode", "aligned") referenceResidues = block.variables.get("reference_residues", []) + import prepare_proteins + + print("Loading PDB files...") + + models = prepare_proteins.proteinModels(inputFolder) + # Parse the chain indexes if chainIndexes is not None: chainIndexes = [x["chain_index"] for x in chainIndexes] else: chainIndexes = [0] + trajectory_chain_indexes = None # Parse the trajectory chain indexes if trajectoryChainIndexes is not None: trajectoryChainIndexes = [x["trajectory_chain_index"] for x in trajectoryChainIndexes] + trajectory_chain_indexes = {} + for i, model in enumerate(models.models_names): + trajectory_chain_indexes[model] = trajectoryChainIndexes[i] # Parse the reference residues if referenceResidues is not None: referenceResidues = [x["reference_residues"] for x in referenceResidues] - import prepare_proteins - - print("Loading PDB files...") - - models = prepare_proteins.proteinModels(inputFolder) - print("Aligning models...") import subprocess @@ -163,7 +167,7 @@ def hookSubprocessMafft(command, **kwargs): pdbReference, outputFolder, chain_indexes=chainIndexes, - trajectory_chain_indexes=trajectoryChainIndexes, + trajectory_chain_indexes=trajectory_chain_indexes, aligment_mode=alignmentMode, reference_residues=referenceResidues, verbose=True, diff --git a/EAPM/Include/Blocks/AlphaFoldEAPM.py b/EAPM/Include/Blocks/AlphaFoldEAPM.py index 815c9d8..9c80b10 100644 --- a/EAPM/Include/Blocks/AlphaFoldEAPM.py +++ b/EAPM/Include/Blocks/AlphaFoldEAPM.py @@ -2,8 +2,6 @@ Module containing the AlphaFold block for the EAPM plugin """ -import os - from HorusAPI import PluginVariable, SlurmBlock, VariableTypes # ==========================# @@ -72,6 +70,8 @@ def initialAlphafold(block: SlurmBlock): print("Alphafold requires an accelerated partition. Changing to acc_bscls.") block.variables["partition"] = "acc_bscls" + import os + # If folder already exists, raise exception if removeExisting and os.path.exists(folderName): os.system("rm -rf " + folderName) @@ -107,6 +107,8 @@ def finalAlhafoldAction(block: SlurmBlock): resultsFolder = block.extraData["folder_name"] + import os + output_models_folder = os.path.join(downloaded_path, resultsFolder, "output_models") block.setOutput(outputModelsVariable.id, output_models_folder) diff --git a/EAPM/Include/Blocks/AnalyseGlideGPX.py b/EAPM/Include/Blocks/AnalyseGlide.py similarity index 55% rename from EAPM/Include/Blocks/AnalyseGlideGPX.py rename to EAPM/Include/Blocks/AnalyseGlide.py index 425a48a..cd3c048 100644 --- a/EAPM/Include/Blocks/AnalyseGlideGPX.py +++ b/EAPM/Include/Blocks/AnalyseGlide.py @@ -1,11 +1,3 @@ -import json -import os -import shutil - -import bsc_calculations -import pandas as pd -import prepare_proteins - from HorusAPI import PluginBlock, PluginVariable, VariableGroup, VariableTypes # TODO Configure the inputs correctly @@ -28,10 +20,16 @@ defaultValue=None, ) residueProtein = PluginVariable( - name="Atom Protein", id="resi_id1", description="atom1", type=VariableTypes.ATOM + name="Atom Protein", + id="resi_id1", + description="Atom of the protein to calculate the distance to", + type=VariableTypes.ATOM, ) residueLigand = PluginVariable( - name="Atom Ligand", id="resi_id2", description="atom2", type=VariableTypes.ATOM + name="Atom Ligand", + id="resi_id2", + description="Atom of the ligand to calculate the distance to", + type=VariableTypes.ATOM, ) resNameProt = PluginVariable( name="Protein residue name", @@ -40,10 +38,10 @@ type=VariableTypes.STRING, defaultValue="CYS", ) -resNameLig = PluginVariable( - name="Ligand residue name", - id="res_name_lig", - description="The ligand residue name", +atomNameProt = PluginVariable( + name="Protein atomname", + id="atom_name_prot", + description="The protein atom name", type=VariableTypes.STRING, defaultValue="SG", ) @@ -54,12 +52,26 @@ type=VariableTypes.STRING, defaultValue="GSH", ) +atomNameLig = PluginVariable( + name="Ligand atom name", + id="atom_name_ligand", + description="The atom name of the ligand", + type=VariableTypes.STRING, + defaultValue="S1", +) stringGroup = VariableGroup( id="string_input", name="Input String", description="The input are in string", - variables=[conservedResidues, glideOutputVariable, resNameProt, resNameLig, ligandName], + variables=[ + conservedResidues, + glideOutputVariable, + resNameProt, + atomNameProt, + ligandName, + atomNameLig, + ], ) atomGroup = VariableGroup( id="atom_input", @@ -70,12 +82,18 @@ # Output variables outputModelsVariable = PluginVariable( - id="models", - name="Alphafold models", - description="The output models", + id="best_poses", + name="Best poses", + description="The best poses from the analysis", type=VariableTypes.FOLDER, ) - +analyseGlideOutputVariable = PluginVariable( + id="glide_results_output", + name="Glide results output", + description="Output results of the Glide analysis", + type=VariableTypes.CUSTOM, + allowedValues=["glide_output"], +) # ==========================# # Variable @@ -87,10 +105,26 @@ type=VariableTypes.STRING, defaultValue="SG_S", ) +removePreviousVar = PluginVariable( + name="Remove previous models", + id="remove_previous", + description="Remove previous", + type=VariableTypes.BOOLEAN, + defaultValue=False, +) +separatorVar = PluginVariable( + name="Separator", + id="separator", + description="The separator", + type=VariableTypes.STRING, + defaultValue="@", +) def finalAction(block: PluginBlock): + import prepare_proteins + bsc_result = block.inputs.get(glideOutputVariable.id, None) folder_to_analyse = bsc_result["dock_folder"] model_folder = bsc_result["model_folder"] @@ -98,43 +132,58 @@ def finalAction(block: PluginBlock): conserved_indexes = block.inputs.get(conservedResidues.id, None) metrics = block.variables.get("metrics", "SG_S") + remove_previous = block.variables.get("remove_previous", False) + separator = block.variables.get("separator", "@") if block.selectedInputGroup == stringGroup.id: res_name_prot = block.inputs.get(resNameProt.id, "CYS") - res_name_lig = block.inputs.get(resNameLig.id, "SG") + atom_name_prot = block.inputs.get(atomNameProt.id, "SG") ligand_name = block.inputs.get(ligandName.id, "GSH") + atom_name_lig = block.inputs.get(atomNameLig.id, "S1") else: residue_protein = block.inputs.get(residueProtein.id, None) + res_name_prot = residue_protein["auth_comp_id"] + atom_name_prot = residue_protein["auth_atom_id"] residue_ligand = block.inputs.get(residueLigand.id, None) + ligand_name = residue_ligand["auth_comp_id"] + atom_name_lig = residue_ligand["auth_atom_id"] + metrics = f"{atom_name_prot}_{atom_name_lig}" models = prepare_proteins.proteinModels(model_folder) + if conserved_indexes is None: + raise ValueError("Conserved residues must be provided") + if not isinstance(conserved_indexes, dict): + try: + conserved_indexes = int(conserved_indexes) + except ValueError: + raise ValueError("Conserved indexes must be an integer or a dictionary of integers") + conserved_indexes_f = {} + for model in models: + conserved_indexes_f[model] = [conserved_indexes] + conserved_indexes = conserved_indexes_f + center_atom = {} # Create dictionary to store the atom 3-element tuple for each model for model in models: # Iterate the models inside the library # Iterate the residues for each Bio.PDB.Structure object for r in models.structures[model].get_residues(): # Check that the residue matches the defined index - aa = conserved_indexes[model] # for cons_ind in conserved_indexes[model]: if r.id[1] in conserved_indexes[model]: # Assert that the residue has the correct residue identity if r.resname == res_name_prot: # Store the corresponsing tuple. - center_atom[model] = (r.get_parent().id, r.id[1], res_name_lig) + center_atom[model] = (r.get_parent().id, r.id[1], atom_name_prot) break - print(f"center_atom: {center_atom}") - atom_pairs = {} # Define the dictionary containing the atom pairs for each model for model in models: atom_pairs[model] = {} for ligand in [ligand_name]: atom_pairs[model][ligand] = [] - atom_pairs[model][ligand].append((center_atom[model], "S1")) + atom_pairs[model][ligand].append((center_atom[model], atom_name_lig)) - print(f"Atom pairs: {atom_pairs}") - - models.analyseDocking(folder_to_analyse, atom_pairs=atom_pairs) + models.analyseDocking(folder_to_analyse, atom_pairs=atom_pairs, separator=separator) metric_distances = {} # Define the global dictionary metric_distances[metrics] = {} # Define the metric nested dictionary @@ -144,24 +193,38 @@ def finalAction(block: PluginBlock): # Define the ligand nested dictionary with all the docking distances list metric_distances[metrics][model][ligand] = models.getDockingDistances(model, ligand) - print(f"metric_distances: {metric_distances}") - models.combineDockingDistancesIntoMetrics(metric_distances) - print(f"models.docking_data: {models.docking_data}") - best_poses = models.getBestDockingPosesIteratively(metric_distances) - models.extractDockingPoses(best_poses, folder_to_analyse, "best_docking_poses", separator="@") + models.extractDockingPoses( + best_poses, + folder_to_analyse, + "best_docking_poses", + separator=separator, + remove_previous=remove_previous, + ) block.setOutput(outputModelsVariable.id, "best_docking_poses") + glideOutput = { + "poses_folder": "best_docking_poses", + "models_folder": model_folder, # "prepared_proteins", + "atom_pairs": atom_pairs, + } + import pickle + + with open("glide_output.pkl", "wb") as f: + pickle.dump(glideOutput, f) + + block.setOutput(analyseGlideOutputVariable.id, glideOutput) + -AnalyseGPXBlock = PluginBlock( - name="Analyse Glide GPX", - description="To analyse Glide GPX results", +AnalyseGBlock = PluginBlock( + name="Analyse Glide", + description="To analyse Glide results", action=finalAction, - variables=[metricsVar], - inputGroups=[stringGroup, atomGroup], - outputs=[outputModelsVariable], + variables=[metricsVar, removePreviousVar, separatorVar], + inputGroups=[atomGroup, stringGroup], + outputs=[outputModelsVariable, analyseGlideOutputVariable], ) diff --git a/EAPM/Include/Blocks/AnalyseGlideDocking.py b/EAPM/Include/Blocks/AnalyseGlideDocking.py index 504bc56..9d1056b 100644 --- a/EAPM/Include/Blocks/AnalyseGlideDocking.py +++ b/EAPM/Include/Blocks/AnalyseGlideDocking.py @@ -1,6 +1,3 @@ -import datetime -import os - from HorusAPI import ( Extensions, PluginBlock, @@ -133,6 +130,10 @@ def analyseDockingAction(block: PluginBlock): + + import datetime + import os + if block.selectedInputGroup == "folder_variable_group": folder_to_analyse = block.inputs.get("docking_folder", "docking") model_folder = block.inputs.get("model_folder", "models") @@ -391,6 +392,7 @@ def analyseDocking( """ import json + import os import pandas as pd import prepare_proteins @@ -529,6 +531,7 @@ def extractDockingPoses( Remove all content in the output folder """ + import os import shutil # Check the separator is not in model or ligand names diff --git a/EAPM/Include/Blocks/AsiteDesign.py b/EAPM/Include/Blocks/AsiteDesign.py index 1bb9b39..3998ec2 100644 --- a/EAPM/Include/Blocks/AsiteDesign.py +++ b/EAPM/Include/Blocks/AsiteDesign.py @@ -2,10 +2,7 @@ Module containing the Asitedesign block for the EAPM plugin """ -import os -import subprocess - -from HorusAPI import PluginVariable, SlurmBlock, VariableList, VariableTypes +from HorusAPI import PluginVariable, SlurmBlock, VariableTypes # ==========================# # Variable inputs @@ -72,6 +69,9 @@ def initialAsite(block: SlurmBlock): + import os + import subprocess + # Get the input variables input_yaml = block.inputs.get("input_yaml", None) input_params = block.inputs.get("input_params", None) @@ -110,6 +110,9 @@ def initialAsite(block: SlurmBlock): def finalAsiteAction(block: SlurmBlock): + + import os + from utils import downloadResultsAction downloaded_path = downloadResultsAction(block) diff --git a/EAPM/Include/Blocks/ConservedResiduesMSA.py b/EAPM/Include/Blocks/ConservedResiduesMSA.py index bee7ae6..de12d8b 100644 --- a/EAPM/Include/Blocks/ConservedResiduesMSA.py +++ b/EAPM/Include/Blocks/ConservedResiduesMSA.py @@ -25,7 +25,7 @@ def getConservedMSAPositions(block: PluginBlock): - proteinFolder = block.inputs.get("protein_folder", "proteins") + proteinFolder = block.inputs.get(proteinFolderVariable.id, "proteins") # Check that there is at least one pdb file in the folder import os @@ -73,7 +73,7 @@ def hookSubprocessMafft(command, **kwargs): Extensions().loadHTML(html, title="Conserved residues") # Get the residue index to get - residueIndexes = block.variables.get("residue_index", []) + residueIndexes = block.variables.get(residueIndexToGetVariable.id, []) if residueIndexes is None or len(residueIndexes) == 0: # Get all the indexes @@ -90,7 +90,7 @@ def hookSubprocessMafft(command, **kwargs): if len(conservedResidues[model]) == 0: raise Exception( "There are no conserved residues for the selected indexes: " - + " ".join(residueIndexes) + + " ".join(str(residueIndexes)) ) break diff --git a/EAPM/Include/Blocks/EpPred.py b/EAPM/Include/Blocks/EpPred.py index 1d385a2..705d77b 100644 --- a/EAPM/Include/Blocks/EpPred.py +++ b/EAPM/Include/Blocks/EpPred.py @@ -1,8 +1,4 @@ -import datetime -import os -import subprocess - -from HorusAPI import PluginVariable, SlurmBlock, VariableList, VariableTypes +from HorusAPI import PluginVariable, SlurmBlock, VariableTypes # TODO Making the block to work in marenostrum, if not, will work in local. # TODO Add to documentation @@ -199,6 +195,8 @@ def runEppred(block: SlurmBlock): + import os + inputfasta = block.inputs.get("input_fasta", None) if inputfasta is None: diff --git a/EAPM/Include/Blocks/HmmAlign.py b/EAPM/Include/Blocks/HmmAlign.py index d494eb1..b993faa 100644 --- a/EAPM/Include/Blocks/HmmAlign.py +++ b/EAPM/Include/Blocks/HmmAlign.py @@ -2,8 +2,6 @@ Module containing the HmmAlign block for the EAPM plugin as a nord3 implementation """ -import os - from HorusAPI import PluginVariable, SlurmBlock, VariableTypes # ==========================# @@ -52,6 +50,8 @@ def runHmmAlign(block: SlurmBlock): + import os + inputfasta = block.inputs.get("input_fasta", None) inputhmm = block.inputs.get("input_hmm", None) @@ -88,7 +88,12 @@ def runHmmAlign(block: SlurmBlock): os.system(f"cp {inputfasta} {folderName}") os.system(f"cp {inputhmm} {folderName}") - jobs = [f"hmmalign {folderName}/{inputhmm} {folderName}/{inputfasta}"] + if block.remote.isLocal: + hmmerExecutable = block.config.get("hmmer_path", "hmmer") + "/hmmalign" + else: + hmmerExecutable = "hmmalign" + + jobs = [f"{hmmerExecutable} {folderName}/{inputhmm} {folderName}/{inputfasta}"] from utils import launchCalculationAction @@ -103,6 +108,8 @@ def runHmmAlign(block: SlurmBlock): def finalAction(block: SlurmBlock): + import os + from utils import downloadResultsAction downloaded_path = downloadResultsAction(block) diff --git a/EAPM/Include/Blocks/HmmBuild.py b/EAPM/Include/Blocks/HmmBuild.py index 6d5a3cc..9f54e7c 100644 --- a/EAPM/Include/Blocks/HmmBuild.py +++ b/EAPM/Include/Blocks/HmmBuild.py @@ -2,8 +2,6 @@ Module containing the HmmBuild block for the EAPM plugin as a nord3 implementation """ -import os - from HorusAPI import PluginVariable, SlurmBlock, VariableTypes # ==========================# @@ -45,6 +43,8 @@ def runHmmBuild(block: SlurmBlock): + import os + input = block.inputs.get("input_msa", None) if "nord3" not in block.remote.host: @@ -76,7 +76,12 @@ def runHmmBuild(block: SlurmBlock): output = block.outputs.get("output", "output.hmm") - jobs = [f"hmmbuild {folderName}/{output} {folderName}/{input}"] + if block.remote.isLocal: + hmmerExecutable = block.config.get("hmmer_path", "hmmer") + "/hmmbuild" + else: + hmmerExecutable = "hmmbuild" + + jobs = [f"{hmmerExecutable} {folderName}/{output} {folderName}/{input}"] from utils import launchCalculationAction @@ -91,6 +96,8 @@ def runHmmBuild(block: SlurmBlock): def finalAction(block: SlurmBlock): + import os + from utils import downloadResultsAction downloaded_path = downloadResultsAction(block) diff --git a/EAPM/Include/Blocks/HmmScan.py b/EAPM/Include/Blocks/HmmScan.py index 7dc939a..208228f 100644 --- a/EAPM/Include/Blocks/HmmScan.py +++ b/EAPM/Include/Blocks/HmmScan.py @@ -2,8 +2,6 @@ Module containing the HmmScan block for the EAPM plugin as a nord3 implementation """ -import os - from HorusAPI import PluginVariable, SlurmBlock, VariableTypes # ==========================# @@ -50,6 +48,7 @@ def runHmmScan(block: SlurmBlock): + import os input = block.inputs.get("input_fasta", None) @@ -83,7 +82,12 @@ def runHmmScan(block: SlurmBlock): hmmDB = block.variables.get("hmm_db", None) output = block.outputs.get("output", "output.hmm") - jobs = [f"hmmscan {hmmDB} {folderName}/{input} -o {folderName}/{output}"] + if block.remote.isLocal: + hmmerExecutable = block.config.get("hmmer_path", "hmmer") + "/hmmscan" + else: + hmmerExecutable = "hmmscan" + + jobs = [f"{hmmerExecutable} {hmmDB} {folderName}/{input} -o {folderName}/{output}"] from utils import launchCalculationAction @@ -98,6 +102,8 @@ def runHmmScan(block: SlurmBlock): def finalAction(block: SlurmBlock): + import os + from utils import downloadResultsAction downloaded_path = downloadResultsAction(block) diff --git a/EAPM/Include/Blocks/HmmSearch.py b/EAPM/Include/Blocks/HmmSearch.py index 10d6fb2..4c46c30 100644 --- a/EAPM/Include/Blocks/HmmSearch.py +++ b/EAPM/Include/Blocks/HmmSearch.py @@ -2,8 +2,6 @@ Module containing the HmmSearch block for the EAPM plugin as a nord3 implementation """ -import os - from HorusAPI import PluginVariable, SlurmBlock, VariableTypes # ==========================# @@ -58,6 +56,8 @@ def runHmmSearch(block: SlurmBlock): + import os + input = block.inputs.get("input_hmm", None) if "nord3" not in block.remote.host: @@ -94,8 +94,13 @@ def runHmmSearch(block: SlurmBlock): "sequence_db", "/gpfs/projects/shared/public/AlphaFold/uniref90/uniref90.fa" ) + if block.remote.isLocal: + hmmerExecutable = block.config.get("hmmer_path", "hmmer") + "/hmmsearch" + else: + hmmerExecutable = "hmmsearch" + jobs = [ - f"hmmsearch --cpu {cpus} -E {evalue} {folderName}/{input} {sequenceDB} -o {folderName}/{output}" + f"{hmmerExecutable} --cpu {cpus} -E {evalue} {folderName}/{input} {sequenceDB} -o {folderName}/{output}" ] from utils import launchCalculationAction @@ -111,6 +116,8 @@ def runHmmSearch(block: SlurmBlock): def finalAction(block: SlurmBlock): + import os + from utils import downloadResultsAction downloaded_path = downloadResultsAction(block) diff --git a/EAPM/Include/Blocks/HmmSearchLocal.py b/EAPM/Include/Blocks/HmmSearchLocal.py index bd6da5e..b8eeffa 100644 --- a/EAPM/Include/Blocks/HmmSearchLocal.py +++ b/EAPM/Include/Blocks/HmmSearchLocal.py @@ -2,10 +2,7 @@ Module containing the HmmSearch block for the EAPM plugin as a local implementation """ -import os -import pyhmmer -from HorusAPI import PluginBlock, PluginVariable, VariableTypes, Extensions - +from HorusAPI import Extensions, PluginBlock, PluginVariable, VariableTypes # ==========================# # Variable inputs @@ -38,47 +35,51 @@ allowedValues=["domtbl"], ) + def runHmmSearch(block: PluginBlock): - + import os + + import pyhmmer + input = block.inputs.get("input_hmm", None) - + if input is None: raise Exception("No input hmm provided") - + if not os.path.exists(input): raise Exception(f"The input hmm file does not exist: {input}") - + try: with pyhmmer.plan7.HMMFile(input) as hmm_file: hmm = hmm_file.read() except Exception as e: raise Exception(f"Error reading the input hmm file: {e}") - + alphabet = pyhmmer.plan7.Alphabet.amino() background = pyhmmer.plan7.Background(alphabet) pipeline = pyhmmer.plan7.Pipeline(alphabet, background=background) - + sequenceDB = block.inputs.get("sequence_db", None) - + if sequenceDB is None: raise Exception("No sequence database provided") - + if not os.path.exists(sequenceDB): raise Exception(f"The sequence database file does not exist: {sequenceDB}") - + try: with pyhmmer.easel.SequenceFile(sequenceDB, digital=True, alphabet=alphabet) as seq_file: hits = pipeline.search_hmm(hmm, seq_file) except Exception as e: raise Exception(f"Error searching the sequence database: {e}") - + output = block.outputs.get("output", "output.domtbl") - + with open(output, "wb") as f: hits.write(f, format="domains") - + block.setOutput("outputVariable", output) - + hmmsearchLocalBlock = PluginBlock( name="HmmSearch Local", diff --git a/EAPM/Include/Blocks/JackHmmer.py b/EAPM/Include/Blocks/JackHmmer.py index f14d47d..b2f01f9 100644 --- a/EAPM/Include/Blocks/JackHmmer.py +++ b/EAPM/Include/Blocks/JackHmmer.py @@ -2,8 +2,6 @@ Module containing the JackHmmer block for the EAPM plugin as a nord3 implementation """ -import os - from HorusAPI import PluginVariable, SlurmBlock, VariableTypes # ==========================# @@ -44,17 +42,25 @@ id="sequence_db", name="Sequence DB", description="The sequence database to search", - type=VariableTypes.STRING, - defaultValue="/gpfs/projects/shared/public/AlphaFold/uniref90/uniref90.fa", + type=VariableTypes.FILE, + defaultValue="/apps/ACC/ALPHAFOLD/SRC/database/Alphafold/uniref90/uniref90.fasta", +) +folderNameVar = PluginVariable( + id="folder_name", + name="Folder name", + description="The folder name", + type=VariableTypes.FOLDER, + defaultValue="jackHmmer", ) def runJackHmmer(block: SlurmBlock): + import os inputfasta = block.inputs.get("input_fasta", None) - if "nord3" not in block.remote.host: - raise Exception("This block only works on Nord3.") + # if "nord3" not in block.remote.host or "glogin" not in block.remote.host: + # raise Exception("This block only works on Nord3 or mn.") if inputfasta is None: raise Exception("No input fasta provided") @@ -71,7 +77,7 @@ def runJackHmmer(block: SlurmBlock): if not removeExisting and os.path.exists(folderName): raise Exception( - "The folder {} already exists. Please, choose another name or remove it.".format( + "The folder {} already exists. Please, choose another name or remove it with the RemoveExistingFolder option.".format( folderName ) ) @@ -80,15 +86,19 @@ def runJackHmmer(block: SlurmBlock): os.makedirs(folderName, exist_ok=True) os.system(f"cp {inputfasta} {folderName}") + inputfasta = os.path.join(folderName, os.path.basename(inputfasta)) + output = block.outputs.get("output", "output.hmm") sequenceDB = block.variables.get( - "sequence_db", "/gpfs/projects/shared/public/AlphaFold/uniref90/uniref90.fa" + "sequence_db", "/apps/ACC/ALPHAFOLD/SRC/database/Alphafold/uniref90/uniref90.fasta" ) cpus = block.variables.get("cpus", 1) - jobs = [ - f"jackhmmer -o {folderName}/{output} --cpu {cpus} {folderName}/{inputfasta} {sequenceDB}" - ] + if block.remote.isLocal: + hmmerExecutable = block.config.get("hmmer_path", "hmmer") + "/jackhmmer" + else: + hmmerExecutable = "jackhmmer" + jobs = [f"{hmmerExecutable} -o {folderName}/{output} --cpu {cpus} {inputfasta} {sequenceDB}"] from utils import launchCalculationAction @@ -103,6 +113,8 @@ def runJackHmmer(block: SlurmBlock): def finalAction(block: SlurmBlock): + import os + from utils import downloadResultsAction downloaded_path = downloadResultsAction(block) @@ -122,6 +134,6 @@ def finalAction(block: SlurmBlock): finalAction=finalAction, description="Iteratively search a protein sequence against a protein database", inputs=[fastaInput], - variables=BSC_JOB_VARIABLES + [sequenceDBVar, removeExistingResults], + variables=BSC_JOB_VARIABLES + [sequenceDBVar, removeExistingResults, folderNameVar], outputs=[outputVariable], ) diff --git a/EAPM/Include/Blocks/MSA2HMM.py b/EAPM/Include/Blocks/MSA2HMM.py index dcdcf60..d1c0d7d 100644 --- a/EAPM/Include/Blocks/MSA2HMM.py +++ b/EAPM/Include/Blocks/MSA2HMM.py @@ -2,11 +2,7 @@ Module containing the MSA2HMM block for the EAPM plugin """ -import pyhmmer -import os - -from HorusAPI import PluginBlock, PluginVariable, VariableTypes, VariableGroup - +from HorusAPI import PluginBlock, PluginVariable, VariableGroup, VariableTypes # ==========================# # Variable inputs @@ -44,36 +40,39 @@ # ==========================# - def convertMSA2HMM(block: PluginBlock): """ Convert MSA to HMM """ + import os + + import pyhmmer # Loading plugin variables inputMSA = block.inputs.get("input_file_msa") if inputMSA is None: raise Exception("No input MSA provided") - + if not os.path.exists(inputMSA): raise Exception(f"The input MSA file does not exist: {inputMSA}") - + alphabet = pyhmmer.easel.Alphabet.amino() - + with pyhmmer.easel.MSAFile(inputMSA, digital=True, alphabet=alphabet) as msa_file: msa = msa_file.read() msa.name = b"input_msa" - + builder = pyhmmer.plan7.Builder(alphabet) background = pyhmmer.plan7.Background(alphabet) hmm, _, _ = builder.build_msa(msa, background) - + output = "output.hmm" with open(output, "wb") as output_file: hmm.write(output_file) - + block.setOutput("output_hmm", output) - + + convertMSAToHMMBlock = PluginBlock( name="MSA to HMM", description="Convert MSA files to HMM", diff --git a/EAPM/Include/Blocks/Mafft.py b/EAPM/Include/Blocks/Mafft.py index 473c19e..b1e764d 100644 --- a/EAPM/Include/Blocks/Mafft.py +++ b/EAPM/Include/Blocks/Mafft.py @@ -2,10 +2,7 @@ Module containing the Mafft block for the EAPM plugin """ -import Bio.AlignIO -import Bio.SeqIO - -from HorusAPI import Extensions, PluginBlock, PluginVariable, VariableTypes +from HorusAPI import PluginBlock, PluginVariable, VariableTypes # ==========================# # Variable inputs diff --git a/EAPM/Include/Blocks/PDBToMAE.py b/EAPM/Include/Blocks/PDBToMAE.py index e033459..4f0f6bb 100644 --- a/EAPM/Include/Blocks/PDBToMAE.py +++ b/EAPM/Include/Blocks/PDBToMAE.py @@ -1,6 +1,3 @@ -import os -import shutil - from HorusAPI import PluginBlock, PluginVariable, VariableGroup, VariableTypes # Input variables @@ -45,6 +42,9 @@ def convertPDBToMAE(block: PluginBlock): + import os + import shutil + # Test if we have valid glide installation command = "echo $SCHRODINGER" output = block.remote.remoteCommand(command) @@ -58,7 +58,23 @@ def convertPDBToMAE(block: PluginBlock): import prepare_proteins - pdb_folder = block.inputs.get("pdb_folder", None) + if block.selectedInputGroup == singlePDBVariable.id: + pdb_file = block.inputs.get("single_pdb", None) + + if pdb_file is None: + raise Exception("No PDB file selected") + + if not os.path.isfile(pdb_file): + raise Exception(f"Invalid PDB file: {pdb_file}") + + if os.path.exists("tmp_ligand"): + shutil.rmtree("tmp_ligand") + os.mkdir("tmp_ligand") + shutil.copy(pdb_file, "tmp_ligand") + + pdb_folder = os.path.join(os.getcwd(), "tmp_ligand") + else: + pdb_folder = block.inputs.get("pdb_folder", None) if pdb_folder is None: raise Exception("No PDB folder selected") @@ -163,12 +179,6 @@ def mockSystem(command): name="PDB to MAE", description="Convert PDB files to MAE for Glide", inputGroups=[ - VariableGroup( - id=structureVariable.id, - name=structureVariable.name, - description=structureVariable.description, - variables=[structureVariable], - ), VariableGroup( id=singlePDBVariable.id, name=singlePDBVariable.name, @@ -181,6 +191,12 @@ def mockSystem(command): description=pdbFolderVariable.description, variables=[pdbFolderVariable], ), + VariableGroup( + id=structureVariable.id, + name=structureVariable.name, + description=structureVariable.description, + variables=[structureVariable], + ), ], variables=[changeLigandNameVariable], outputs=[outputVariable], diff --git a/EAPM/Include/Blocks/PeleEAPM.py b/EAPM/Include/Blocks/PeleEAPM.py index ddb9136..636fec9 100644 --- a/EAPM/Include/Blocks/PeleEAPM.py +++ b/EAPM/Include/Blocks/PeleEAPM.py @@ -1,4 +1,4 @@ -import random + from HorusAPI import (PluginVariable, SlurmBlock, VariableGroup, VariableList, VariableTypes) @@ -9,7 +9,7 @@ name="PELE yaml", description="YAML file containing the PELE configuration", type=VariableTypes.FILE, - defaultValue="cst_input.yaml", + defaultValue="input.yaml", allowedValues=["yaml"], ) @@ -25,7 +25,7 @@ id="poses_folder", name="Best docking poses", description="Best docking poses to analyse", - type=VariableTypes.FOLDER + type=VariableTypes.FOLDER, ) glideOutputVariable = PluginVariable( @@ -41,7 +41,7 @@ id="folder_input_group", name="Folder input group", description="Input the model and ligand folders after a Dcoking Grid setup has been run", - variables=[modelFolderVariable, posesFolderVariable, yamlPELEFileVariable] + variables=[modelFolderVariable, posesFolderVariable, yamlPELEFileVariable], ) glideOutputGroup = VariableGroup( @@ -168,7 +168,7 @@ name="PELE separator", description="Separator for the PELE models and ligands", type=VariableTypes.STRING, - defaultValue="-", + defaultValue="@", category="PELE", ) @@ -326,7 +326,7 @@ description="Enable log file", type=VariableTypes.BOOLEAN, defaultValue=False, - category="PELE" + category="PELE", ) rescoringVariable = PluginVariable( @@ -335,7 +335,7 @@ description="Enable rescoring", type=VariableTypes.BOOLEAN, defaultValue=False, - category="PELE" + category="PELE", ) epsilonVariable = PluginVariable( @@ -344,7 +344,7 @@ description="TODO Epsilon description", type=VariableTypes.FLOAT, defaultValue=0.5, - category="PELE" + category="PELE", ) ligandEquilibrationCstVariable = PluginVariable( @@ -353,7 +353,7 @@ description="TODO Ligand equilibration cst description", type=VariableTypes.BOOLEAN, defaultValue=True, - category="PELE" + category="PELE", ) covalentSetupVariable = PluginVariable( @@ -362,7 +362,7 @@ description="Enable covalent setup", type=VariableTypes.BOOLEAN, defaultValue=False, - category="PELE" + category="PELE", ) nonbondedNewFlagVariable = PluginVariable( @@ -371,7 +371,7 @@ description="Enable nonbonded new flag", type=VariableTypes.BOOLEAN, defaultValue=False, - category="PELE" + category="PELE", ) onlyModelsVariable = PluginVariable( @@ -414,12 +414,12 @@ category="PELE", ) -membraneResiduesVariable= PluginVariable( +membraneResiduesVariable = PluginVariable( id="membrane_residues", name="Membrane residues", description="TODO membrane residues description", type=VariableTypes.LIST, - category="PELE" + category="PELE", ) biasToPointVariable = PluginVariable( @@ -427,7 +427,7 @@ name="Bias to point", description="TODO bias_to_point description", type=VariableTypes.LIST, - category="PELE" + category="PELE", ) comBias1Variable = PluginVariable( @@ -435,7 +435,7 @@ name="com bias1", description="TODO com_bias1 description", type=VariableTypes.LIST, - category="PELE" + category="PELE", ) comBias2Variable = PluginVariable( @@ -443,7 +443,7 @@ name="com bias2", description="TODO com_bias2 description", type=VariableTypes.LIST, - category="PELE" + category="PELE", ) ligandTemplateVariable = PluginVariable( @@ -478,35 +478,35 @@ id="model", name="Model", description="TODO model variable description", - type=VariableTypes.STRING + type=VariableTypes.STRING, ) ligandVariable = PluginVariable( id="ligand", name="Ligand", description="TODO ligand variable description", - type=VariableTypes.STRING + type=VariableTypes.STRING, ) chainVariable = PluginVariable( id="chain", name="Chain", description="TODO chain variable description", - type=VariableTypes.STRING + type=VariableTypes.STRING, ) residueVariable = PluginVariable( id="residue", name="Residue number", description="TODO residue number variable description", - type=VariableTypes.INTEGER + type=VariableTypes.INTEGER, ) atomNameVariable = PluginVariable( id="atom_name", name="Atom name", description="TODO atom name variable description", - type=VariableTypes.STRING + type=VariableTypes.STRING, ) # box_centers VariableList @@ -515,13 +515,7 @@ name="Box centers", description="TODO Box center variable description", category="PELE", - prototypes=[ - modelVariable, - ligandVariable, - chainVariable, - residueVariable, - atomNameVariable - ], + prototypes=[modelVariable, ligandVariable, chainVariable, residueVariable, atomNameVariable], ) # Outputs @@ -533,9 +527,13 @@ ) - def peleAction(block: SlurmBlock): if block.selectedInputGroup == "glide_output_group": + glide_outputr = block.inputs.get("glide_output") + # load the pickle file + + # with open(glide_outputr, "rb") as f: + # glide_output = pickle.load(f) glide_output = block.inputs.get("glide_output") poses_folder = glide_output.get("poses_folder") models_folder = glide_output.get("models_folder") @@ -549,7 +547,7 @@ def peleAction(block: SlurmBlock): atom_pairs = {} # Get all the variables from the block - boxCentersValue = block.variables.get("box_centers", []) + boxCentersValue = block.variables.get("box_centers", None) boxRadiusValue = block.variables.get("box_radius", 10) constraintsValue = block.variables.get("constraints", []) ligandIndexValue = block.variables.get("ligand_index", 1) @@ -558,7 +556,7 @@ def peleAction(block: SlurmBlock): peleIterationsValue = block.variables.get("pele_iterations", 5) equilibrationStepsValue = block.variables.get("equilibration_steps", 100) ligandEnergyGroupsValue = block.variables.get("ligand_energy_groups", []) - peleSeparatorValue = block.variables.get("pele_separator", "-") + peleSeparatorValue = block.variables.get("pele_separator", "@") usePeleffyValue = block.variables.get("use_peleffy", True) useSrunValue = block.variables.get("use_srun", True) energyByResidueValue = block.variables.get("energy_by_residue", False) @@ -579,7 +577,7 @@ def peleAction(block: SlurmBlock): onlyModelsValue = block.variables.get("only_models", []) onlyLigandsValue = block.variables.get("only_ligands", []) onlyCombinationsValue = block.variables.get("only_combinations", []) - nonbondedEnergyValue = block.variables.get('nonbonded_energy', {}) + nonbondedEnergyValue = block.variables.get("nonbonded_energy", {}) ligandTemplateValue = block.variables.get("ligand_template", "") seedValue = block.variables.get("seed", -1) logFileValue = block.variables.get("log_file", False) @@ -595,19 +593,31 @@ def peleAction(block: SlurmBlock): comBias2Value = block.variables.get("com_bias2", {}) # Parse spawningValue - validSpawnings = ['independent', 'inverselyProportional', 'epsilon', 'variableEpsilon', - 'independentMetric', 'UCB', 'FAST', 'ProbabilityMSM', 'MetastabilityMSM', - 'IndependentMSM'] - + validSpawnings = [ + "independent", + "inverselyProportional", + "epsilon", + "variableEpsilon", + "independentMetric", + "UCB", + "FAST", + "ProbabilityMSM", + "MetastabilityMSM", + "IndependentMSM", + ] + if spawningValue != None and spawningValue not in validSpawnings: - message = 'Spawning method %s not found.' % spawningValue - message = 'Allowed options are: ' + str(validSpawnings) - raise ValueError(message) + message = "Spawning method %s not found." % spawningValue + message = "Allowed options are: " + str(validSpawnings) + raise ValueError(message) # Parse energyByResidueValue - energy_by_residue_types = ['all', 'lennard_jones', 'sgb', 'electrostatic'] + energy_by_residue_types = ["all", "lennard_jones", "sgb", "electrostatic"] if energyByResidueTypeValue not in energy_by_residue_types: - raise ValueError('%s not found. Try: %s' % (energyByResidueTypeValue, energy_by_residue_types)) + raise ValueError( + "%s not found. Try: %s" % (energyByResidueTypeValue, energy_by_residue_types) + ) + import random # Parse seedValue if seedValue == -1: @@ -616,44 +626,49 @@ def peleAction(block: SlurmBlock): # Parse ligandEnergyGroups if not isinstance(ligandEnergyGroupsValue, type(None)): if not isinstance(ligandEnergyGroupsValue, dict): - raise ValueError('Ligand energy groups, must be given as a dictionary') - - # Parse box_centers + raise ValueError("Ligand energy groups, must be given as a dictionary") - box_centers = {} - for model in boxCentersValue: - box_centers[(model['model'], model['ligand'])] = (model['chain'], model['residue'], model['atom_name']) + # Parse box_centers + if not isinstance(boxCentersValue, type(None)) or boxCentersValue is not None: + box_centers = {} + for model in boxCentersValue: + box_centers[(model["model"], model["ligand"])] = ( + model["chain"], + model["residue"], + model["atom_name"], + ) + else: + box_centers = None # Parse skip_models if not isinstance(skipModelsValue, type(None)): if not isinstance(skipModelsValue, list): - raise ValueError('skip_models must be a list.') + raise ValueError("skip_models must be a list.") - # Parse skip_ligands if not isinstance(skipLigandsValue, type(None)): if not isinstance(skipLigandsValue, list): - raise ValueError('skip_ligands must be a list.') + raise ValueError("skip_ligands must be a list.") # Parse nonbonded_energy if not isinstance(nonbondedEnergyValue, type(None)): if not isinstance(nonbondedEnergyValue, dict): - raise ValueError('nonbonded_energy, must be given as a dictionary') + raise ValueError("nonbonded_energy, must be given as a dictionary") # Parse only_ligands if not isinstance(onlyLigandsValue, type(None)): if not isinstance(onlyLigandsValue, list): - raise ValueError('only_ligands must be a list.') - + raise ValueError("only_ligands must be a list.") + # Parse only_models if not isinstance(onlyModelsValue, type(None)): if not isinstance(onlyModelsValue, list): - raise ValueError('only_models must be a list.') + raise ValueError("only_models must be a list.") # Parse only_combinations if not isinstance(onlyCombinationsValue, type(None)): if not isinstance(onlyCombinationsValue, list): - raise ValueError('only_combinations must be a list.') + raise ValueError("only_combinations must be a list.") import prepare_proteins @@ -663,7 +678,7 @@ def peleAction(block: SlurmBlock): selections = block.variables.get("selections_list", []) if atom_pairs == {}: groups = [] - for model in models: + for model in models: atom_pairs[model] = {} for selection in selections: current_group = selection["group"] @@ -686,7 +701,7 @@ def peleAction(block: SlurmBlock): atom_pairs[model][ligandName] = [] atom_pairs[model][ligandName].append((protein_tuple, ligand_atom)) - cst_yaml = block.inputs.get("yaml_pele_file") + input_yaml = block.inputs.get("yaml_pele_file") cpus = block.variables.get("cpus", 48) peleFolderName = block.variables.get("pele_folder_name", "pele") @@ -694,7 +709,7 @@ def peleAction(block: SlurmBlock): jobs = models.setUpPELECalculation( peleFolderName, poses_folder, - cst_yaml, + input_yaml, box_radius=boxRadiusValue, iterations=peleIterationsValue, cpus=cpus, @@ -738,7 +753,7 @@ def peleAction(block: SlurmBlock): bias_to_point=biasToPointValue, com_bias1=comBias1Value, com_bias2=comBias2Value, - ligand_energy_groups=ligandEnergyGroupsValue + ligand_energy_groups=ligandEnergyGroupsValue, ) from utils import launchCalculationAction @@ -754,7 +769,7 @@ def peleAction(block: SlurmBlock): ) -def peleFinalAction(block: SlurmBlock):# +def peleFinalAction(block: SlurmBlock): # print("Pele finished") from utils import downloadResultsAction @@ -766,7 +781,6 @@ def peleFinalAction(block: SlurmBlock):# block.setOutput("pele_output_folder", peleFolderName) - from utils import BSC_JOB_VARIABLES blockVariables = BSC_JOB_VARIABLES + [ @@ -813,18 +827,20 @@ def peleFinalAction(block: SlurmBlock):# membraneResiduesVariable, biasToPointVariable, comBias1Variable, - comBias2Variable - + comBias2Variable, ] + def wrappedFunction(block: SlurmBlock): try: peleAction(block) except Exception as e: import traceback + print("Exception:", e) traceback.print_exc() + peleBlock = SlurmBlock( name="PELE", description="Run PELE", diff --git a/EAPM/Include/Blocks/PrepWizardEAPM.py b/EAPM/Include/Blocks/PrepWizardEAPM.py index 08e903a..70774a2 100644 --- a/EAPM/Include/Blocks/PrepWizardEAPM.py +++ b/EAPM/Include/Blocks/PrepWizardEAPM.py @@ -2,9 +2,7 @@ Module containing the PrepWizard block for the EAPM plugin """ -import os - -from HorusAPI import PluginVariable, SlurmBlock, VariableTypes +from HorusAPI import PluginVariable, SlurmBlock, VariableGroup, VariableTypes # ==========================# # Variable inputs @@ -16,6 +14,26 @@ type=VariableTypes.FOLDER, defaultValue=None, ) +inputFilePW = PluginVariable( + name="Input File", + id="input_file", + description="File of the pdb to prepare.", + type=VariableTypes.FILE, + allowedValues=["pdb"], +) +folderVariableGroup = VariableGroup( + id="folder_variable_group", + name="Folder variable group", + description="Input folder with the models.", + variables=[inputFolderPW], +) +fileVariableGroup = VariableGroup( + id="file_output_variable_group", + name="PDB file group", + description="Input PDB file.", + variables=[inputFilePW], +) + # ==========================# # Variable outputs @@ -26,6 +44,13 @@ description="Folder containing the prepared proteins.", type=VariableTypes.FOLDER, ) +outputPDB = PluginVariable( + name="Output PDB", + id="out_pdb", + description="Last PDB of the Prepwizard.", + type=VariableTypes.FILE, + allowedValues=["pdb"], +) ############################## # Block's advanced variables # @@ -39,6 +64,7 @@ ) +# Variables phPW = PluginVariable( name="PH", id="ph", @@ -112,28 +138,36 @@ def prepWizardAction(block: SlurmBlock): Args: block (SlurmBlock): The block to run the action on. """ - # Loading plugin variables - inputFolder = block.inputs.get("input_folder", None) - if inputFolder is None: - raise Exception("No input folder provided.") + + import os + + if block.selectedInputGroup == fileVariableGroup.id: + input_file = block.inputs.get(inputFilePW.id, None) + input_folder = "models" + os.makedirs(input_folder, exist_ok=True) + os.system(f"cp {input_file} {input_folder}") + elif block.selectedInputGroup == folderVariableGroup.id: + input_folder = block.inputs.get(inputFolderPW.id, None) + else: + raise Exception("No input selected") # Get prepWizard variables - folderName = block.variables.get("folder_name", "prepared_proteins") - ph = int(block.variables.get("ph", 7)) - epikPH = block.variables.get("epik_ph", False) - sampleWater = block.variables.get("sample_water", False) - removeHydrogens = block.variables.get("remove_hydrogens", False) - delWaterHbondCutOff = block.variables.get("del_water_hbond_cut_off", False) - fillLoops = block.variables.get("fill_loops", False) - protonationStates = block.variables.get("protonation_states", None) - noepik = block.variables.get("no_epik", False) - noProtAssign = block.variables.get("no_prot_assign", False) + folderName = block.variables.get(folderNameVariable.id, "prepared_proteins") + ph = int(block.variables.get(phPW.id, 7)) + epikPH = block.variables.get(epikPHPW.id, False) + sampleWater = block.variables.get(sampleWaterPW.id, False) + removeHydrogens = block.variables.get(removeHydrogensPW.id, False) + delWaterHbondCutOff = block.variables.get(delWaterHbondCutOffPW.id, False) + fillLoops = block.variables.get(fillLoopsPW.id, False) + protonationStates = block.variables.get(protonationStatesPW.id, None) + noepik = block.variables.get(noepikPW.id, False) + noProtAssign = block.variables.get(noProtAssignPW.id, False) import prepare_proteins print("Loading pdbs files...") - models = prepare_proteins.proteinModels(inputFolder) + models = prepare_proteins.proteinModels(input_folder) print("Setting up PrepWizard Optimitzations...") @@ -173,11 +207,13 @@ def prepWizardAction(block: SlurmBlock): def downloadPrepWizardResults(block: SlurmBlock): + import os + from utils import downloadResultsAction downloadResultsAction(block) - folderName = block.variables.get("folder_name", "prepared_proteins") + folderName = block.variables.get(folderNameVariable.id, "prepared_proteins") # Create the output folder containing the prepared proteins if not os.path.exists(folderName): @@ -193,7 +229,8 @@ def downloadPrepWizardResults(block: SlurmBlock): pdbPath = os.path.join(folderName + "_wizard", "output_models", model, file) shutil.copyfile(pdbPath, finalPath) - block.setOutput("prepared_proteins", folderName) + block.setOutput(outputPDB.id, finalPath) + block.setOutput(outputPW.id, folderName) from utils import BSC_JOB_VARIABLES @@ -218,6 +255,6 @@ def downloadPrepWizardResults(block: SlurmBlock): initialAction=prepWizardAction, finalAction=downloadPrepWizardResults, variables=block_variables, - inputs=[inputFolderPW], - outputs=[outputPW], + inputGroups=[folderVariableGroup, fileVariableGroup], + outputs=[outputPDB, outputPW], ) diff --git a/EAPM/Include/Blocks/Rbcavity.py b/EAPM/Include/Blocks/Rbcavity.py new file mode 100644 index 0000000..04b89ac --- /dev/null +++ b/EAPM/Include/Blocks/Rbcavity.py @@ -0,0 +1,93 @@ +""" +Module containing the rbcavity block for the EAPM plugin +""" + +from HorusAPI import PluginBlock, PluginVariable, VariableTypes + +# ==========================# +# Variable inputs +# ==========================# +inputPRMFile = PluginVariable( + name="Parameter file", + id="input_prm_file", + description="The input '.prm' file.", + type=VariableTypes.FILE, + defaultValue="parameter_file.prm", + allowedValues=["prm"], +) + +# ==========================# +# Variable outputs +# ==========================# +outputLog = PluginVariable( + name="Output log", + id="output_log", + description="The output log file.", + type=VariableTypes.FILE, + defaultValue="parameter_file.log", +) + + +############################## +# Other variables # +############################## +was = PluginVariable( + name="Was", + id="was", + description="Write docking cavities (plus distance grid) to .as file.", + type=VariableTypes.BOOLEAN, + defaultValue=True, +) +dumpInsight = PluginVariable( + name="Dump Insight", + id="dump_insight", + description="Dump InsightII/PyMOL grids for each cavity for visualisation.", + type=VariableTypes.BOOLEAN, + defaultValue=False, +) + + +# Align action block +def initialRbcavity(block: PluginBlock): + + if block.remote.name != "Local": + raise Exception("This block is only available for local execution.") + + # Loading plugin variables + input_PRMfile = block.inputs.get(inputPRMFile.id, None) + output_log = block.outputs.get(outputLog.id, "parameter_file.log") + + # rbcavity -was -d -r parameter_file.prm > parameter_file.log + command = "rbcavity " + if block.variables.get("was", True): + command += "-was " + if block.variables.get("dump_insight", False): + command += "-d " + command += f"-r {input_PRMfile} > {output_log}" + + print("Setting output of block to the results directory...") + + # subprocess the command + import subprocess + + completed_process = subprocess.run(command, shell=True, capture_output=True, text=True) + + # Get the output and error + output = completed_process.stdout + error = completed_process.stderr + + # Set the output + block.setOutput(outputLog.id, output_log) + + +rbCavityBlock = PluginBlock( + name="Rbcavity", + description="Calculate docking cavities. (For local)", + action=initialRbcavity, + variables=[ + was, + dumpInsight, + ], + inputs=[inputPRMFile], + outputs=[outputLog], +) diff --git a/EAPM/Include/Blocks/Rbdock.py b/EAPM/Include/Blocks/Rbdock.py new file mode 100644 index 0000000..38d0124 --- /dev/null +++ b/EAPM/Include/Blocks/Rbdock.py @@ -0,0 +1,128 @@ +""" +Module containing the rbdock block for the EAPM plugin +""" + +from HorusAPI import PluginBlock, PluginVariable, VariableTypes + +# ==========================# +# Variable inputs +# ==========================# +inputPRMFile = PluginVariable( + name="Parameter file", + id="input_prm_file", + description="The input '.prm' file.", + type=VariableTypes.FILE, + defaultValue="parameter_file.prm", + allowedValues=["prm"], +) +inputLigand = PluginVariable( + name="Ligand SD file", + id="input_ligand", + description="The input ligand SD file.", + type=VariableTypes.FILE, + allowedValues=["sd"], +) + +# ==========================# +# Variable outputs +# ==========================# +outputFile = PluginVariable( + name="Output File", + id="output_file", + description="The output file with the ligand docked.", + type=VariableTypes.FILE, + defaultValue="parameter_file", +) + + +############################## +# Other variables # +############################## +protoPrmFile = PluginVariable( + name="proto Prm File", + id="proto_prm_file", + description="The docking protocol parameter file.", + type=VariableTypes.FILE, + defaultValue="dock.prm", +) +nRuns = PluginVariable( + name="nRuns", + id="n_runs", + description="Number of runs/ligand (default=1).", + type=VariableTypes.INTEGER, + defaultValue=None, +) +allH = PluginVariable( + name="allH", + id="all_h", + description="Keep all hydrogens, read all hydrogens present (default=polar hydrogens only).", + type=VariableTypes.BOOLEAN, + defaultValue=False, +) + + +# Align action block +def initialRbdock(block: PluginBlock): + + import os + + if block.remote.name != "Local": + raise Exception("This block is only available for local execution.") + + # Loading plugin variables + input_PRMfile = block.inputs.get(inputPRMFile.id, None) + if input_PRMfile is None: + raise Exception("No parameter file provided.") + if not os.path.exists(input_PRMfile): + raise Exception("Parameter file does not exist.") + input_ligand = block.inputs.get(inputLigand.id, None) + out = "output_dock" + if input_ligand is None: + raise Exception("No ligand file provided.") + if not os.path.exists(input_ligand): + raise Exception("Ligand file does not exist.") + else: + out = os.path.basename(input_ligand).split(".")[0] + "_out" + output_file = block.outputs.get(outputFile.id, out) + + # rbcavity -was -d -r parameter_file.prm > parameter_file.log + command = f"rbdock -i {input_ligand} -o {output_file} -r {input_PRMfile} " + if block.variables.get("proto_prm_file", None) is not None: + command += f"-p {block.variables.get('proto_prm_file')} " + if block.variables.get("n_runs", None) is not None: + command += f"-n {block.variables.get('n_runs')} " + if block.variables.get("all_h", False): + command += "-allH " + + print("Setting output of block to the results directory...") + + # Subprocess the command + import subprocess + + completed_process = subprocess.run(command, shell=True, capture_output=True, text=True) + + # Get the output and error + output = completed_process.stdout + # Save the output and error + with open(f"{output_file}.out", "w") as f: + f.write(output) + error = completed_process.stderr + with open(f"{output_file}.err", "w") as f: + f.write(error) + + # Set the output + block.setOutput(outputFile.id, output_file) + + +rbDockBlock = PluginBlock( + name="Rbdock", + description="Calculate the docking. (For local)", + action=initialRbdock, + variables=[ + protoPrmFile, + nRuns, + allH, + ], + inputs=[inputPRMFile, inputLigand], + outputs=[outputFile], +) diff --git a/EAPM/Include/Blocks/SetupDockingGrid.py b/EAPM/Include/Blocks/SetupDockingGrid.py index 4a41dfd..73b3898 100644 --- a/EAPM/Include/Blocks/SetupDockingGrid.py +++ b/EAPM/Include/Blocks/SetupDockingGrid.py @@ -1,6 +1,4 @@ -import os - -from HorusAPI import InputBlock, PluginVariable, SlurmBlock, VariableGroup, VariableTypes +from HorusAPI import PluginVariable, SlurmBlock, VariableGroup, VariableTypes # Input variables modelFolderVariable = PluginVariable( @@ -45,6 +43,8 @@ # Action def glideDocking(block: SlurmBlock): + import os + import prepare_proteins models_folder = block.inputs.get("model_folder") diff --git a/EAPM/Include/Blocks/SetupGlide.py b/EAPM/Include/Blocks/SetupGlide.py index 1107585..908875e 100644 --- a/EAPM/Include/Blocks/SetupGlide.py +++ b/EAPM/Include/Blocks/SetupGlide.py @@ -1,6 +1,3 @@ -import os -import shutil - from HorusAPI import PluginVariable, SlurmBlock, VariableGroup, VariableTypes # Input variables @@ -54,6 +51,9 @@ def setupGlideDocking(block: SlurmBlock): + import os + import shutil + import prepare_proteins if block.selectedInputGroup == "folder_input_group": @@ -138,6 +138,8 @@ def setupGlideDocking(block: SlurmBlock): def downloadGlideDocking(block: SlurmBlock): + import os + from utils import downloadResultsAction downloadResultsAction(block) diff --git a/EAPM/Include/Blocks/TrimAlphafoldModels.py b/EAPM/Include/Blocks/TrimAlphafoldModels.py index 2d96444..a24eae0 100644 --- a/EAPM/Include/Blocks/TrimAlphafoldModels.py +++ b/EAPM/Include/Blocks/TrimAlphafoldModels.py @@ -1,6 +1,3 @@ -import os -import shutil - from HorusAPI import PluginBlock, PluginVariable, VariableTypes resultsFolderAF = PluginVariable( @@ -37,6 +34,8 @@ def trimAlphaFoldModels(block: PluginBlock): + import os + import shutil # Get the models folder models_folder = block.inputs.get("results_folder", None) diff --git a/EAPM/Include/Blocks/testBlock.py b/EAPM/Include/Blocks/testBlock.py index 36b14cb..ded4d28 100644 --- a/EAPM/Include/Blocks/testBlock.py +++ b/EAPM/Include/Blocks/testBlock.py @@ -1,6 +1,3 @@ -import os -import shutil - from HorusAPI import PluginBlock, PluginVariable, VariableTypes # ==========================# @@ -43,6 +40,8 @@ def finalAlhafoldAction(block: PluginBlock): + import os + import shutil resultsFolder = "alphafold" downloaded_path = "/home/perry/data/acanella/testHorus/all_test/alphafold" diff --git a/EAPM/Include/Configs/hmmerConfig.py b/EAPM/Include/Configs/hmmerConfig.py index b3c5322..451b6e2 100644 --- a/EAPM/Include/Configs/hmmerConfig.py +++ b/EAPM/Include/Configs/hmmerConfig.py @@ -18,7 +18,7 @@ def checkHmmerInstallation(block: PluginConfig): hmmerPath = block.variables.get(hmmerPathVariable.id) # Check if the path is valid - if not os.path.isfile(hmmerPath): + if not os.path.isdir(hmmerPath): raise Exception("The HMMER executable path is not valid") diff --git a/EAPM/Include/utils.py b/EAPM/Include/utils.py index 9201443..87dc0ac 100644 --- a/EAPM/Include/utils.py +++ b/EAPM/Include/utils.py @@ -4,8 +4,7 @@ import subprocess import typing -from HorusAPI import (PluginBlock, PluginVariable, SlurmBlock, VariableList, - VariableTypes) +from HorusAPI import PluginBlock, PluginVariable, SlurmBlock, VariableList, VariableTypes localIPs = {"cactus": "84.88.51.217", "blossom": "84.88.51.250", "bubbles": "84.88.51.219"} @@ -36,7 +35,7 @@ def setup_bsc_calculations_based_on_horus_remote( if program == "pele": if cluster not in [ "glogin1.bsc.es", - "glogin4.bsc.es", + "glogin2.bsc.es", "glogin3.bsc.es", "glogin4.bsc.es", "nord3.bsc.es", @@ -121,7 +120,7 @@ def setup_bsc_calculations_based_on_horus_remote( program=program, script_name=scriptName, cpus=cpus, - #module_purge=modulePurge, + # module_purge=modulePurge, ) # powerpuff elif cluster == "powerpuff": diff --git a/EAPM/config/eapm.json b/EAPM/config/eapm.json index e4550b8..f9593d4 100644 --- a/EAPM/config/eapm.json +++ b/EAPM/config/eapm.json @@ -1,4 +1,4 @@ { "mafft_path": "/home/perry/miniconda3/envs/horus/bin/mafft", - "hmmer_path": "/gpfs/projects/bsc72/conda_envs/hmm/bin/hmmsearch" + "hmmer_path": "/home/perry/miniconda3/envs/horus/bin" } \ No newline at end of file diff --git a/EAPM/preinst.sh b/EAPM/preinst.sh index b19f3b8..f33cc76 100644 --- a/EAPM/preinst.sh +++ b/EAPM/preinst.sh @@ -1,3 +1,3 @@ -pip install "pycaret[analysis, models]" --target deps +# pip install "pycaret[analysis, models]" --target deps -pip install "werkzeug<=2.3.0" +# pip install "werkzeug<=2.3.0" From c2f33cc46f64b5d04b427e96ae7adc269369cea5 Mon Sep 17 00:00:00 2001 From: AlbertCS Date: Wed, 5 Jun 2024 15:35:06 +0200 Subject: [PATCH 6/6] new bug solving --- EAPM/Include/Blocks/AlphaFoldEAPM.py | 4 +- EAPM/Include/Blocks/AsiteDesign.py | 110 ++++++++++++++++---------- EAPM/Include/Blocks/PrepWizardEAPM.py | 5 ++ EAPM/config/eapm_Local.json | 4 + EAPM/config/eapm_acc.json | 4 + EAPM/config/eapm_nord3_test.json | 4 + 6 files changed, 88 insertions(+), 43 deletions(-) create mode 100644 EAPM/config/eapm_Local.json create mode 100644 EAPM/config/eapm_acc.json create mode 100644 EAPM/config/eapm_nord3_test.json diff --git a/EAPM/Include/Blocks/AlphaFoldEAPM.py b/EAPM/Include/Blocks/AlphaFoldEAPM.py index 9c80b10..6c14350 100644 --- a/EAPM/Include/Blocks/AlphaFoldEAPM.py +++ b/EAPM/Include/Blocks/AlphaFoldEAPM.py @@ -78,7 +78,7 @@ def initialAlphafold(block: SlurmBlock): if not removeExisting and os.path.exists(folderName): raise Exception( - "The folder {} already exists. Please, choose another name or remove it.".format( + "The folder {} already exists. Please, choose another name or remove it with the remove existing folder option.".format( folderName ) ) @@ -97,7 +97,7 @@ def initialAlphafold(block: SlurmBlock): from utils import launchCalculationAction - launchCalculationAction(block, jobs, folderName) + launchCalculationAction(block, jobs, "alphafold", [folderName]) def finalAlhafoldAction(block: SlurmBlock): diff --git a/EAPM/Include/Blocks/AsiteDesign.py b/EAPM/Include/Blocks/AsiteDesign.py index 3998ec2..ef0b2bd 100644 --- a/EAPM/Include/Blocks/AsiteDesign.py +++ b/EAPM/Include/Blocks/AsiteDesign.py @@ -31,41 +31,42 @@ defaultValue=None, ) + # ==========================# # Variable outputs # ==========================# -outputFolderAsite = PluginVariable( - name="Asite simulation folder", - id="folder_name", - description="The name of the folder where the simulation will be stored.", - type=VariableTypes.STRING, - defaultValue="AsiteDesign", -) outputModelsAsite = PluginVariable( name="Models", id="models", description="The models generated by the simulation.", type=VariableTypes.FOLDER, - defaultValue="DesignCatalyticSite_job_final_pose", + defaultValue="job_final_pose", ) ############################## # Other variables # ############################## -queue = PluginVariable( - name="Cluster queue", - id="partition", - description="The queue for the simulation", - type=VariableTypes.STRING, - defaultValue="bsc_ls", -) containerAsite = PluginVariable( name="Container", id="container", description="If you are launching the block in a container. The container to use.", - type=VariableTypes.STRING, + type=VariableTypes.FILE, defaultValue=None, ) +outputFolderAsite = PluginVariable( + name="Asite simulation folder", + id="folder_name", + description="The name of the folder where the simulation will be stored.", + type=VariableTypes.STRING, + defaultValue="AsiteDesign", +) +removeExistingResults = PluginVariable( + name="Remove existing results", + id="remove_existing_results", + description="Remove existing results", + type=VariableTypes.BOOLEAN, + defaultValue=False, +) def initialAsite(block: SlurmBlock): @@ -73,40 +74,69 @@ def initialAsite(block: SlurmBlock): import subprocess # Get the input variables - input_yaml = block.inputs.get("input_yaml", None) - input_params = block.inputs.get("input_params", None) - input_pdb = block.inputs.get("input_pdb", None) - cpus = block.variables.get("cpus", 0) + input_yaml = block.inputs.get(inputYamlAsite.id, None) + input_params = block.inputs.get(inputParamsAsite.id, None) + input_pdb = block.inputs.get(inputPDBAsite.id, None) container = block.variables.get("container", None) output_file = input_yaml.rstrip(".yaml").split("/")[-1] + ".out" + cpus = block.variables.get("cpus_per_task", 1) + + cwd = os.getcwd() - # copiar pdb and params to output folder - subprocess.run(["cp", input_yaml, os.getcwd()], check=True) - subprocess.run(["cp", input_pdb, os.getcwd()], check=True) - subprocess.run(["cp", "-r", input_params, os.getcwd()], check=True) + if not os.path.exists(input_yaml): + raise Exception(f"Input yaml file {input_yaml} not found") + if not os.path.exists(input_params): + raise Exception(f"Input parameters folder {input_params} not found") + if not os.path.exists(input_pdb): + raise Exception(f"Input pdb file {input_pdb} not found") - input_yaml = input_yaml.split("/")[-1] + # removeExisting = block.variables.get("remove_existing_results", False) + + # if removeExisting and os.path.exists(folder_name): + # os.system("rm -rf " + folder_name) + + # if not removeExisting and os.path.exists(folder_name): + # raise Exception( + # "The folder {} already exists. Please, choose another name or remove it.".format( + # folder_name + # ) + # ) + + # os.makedirs(folder_name, exist_ok=True) + + # # copiar pdb and params to output folder + # subprocess.run(["cp", input_yaml, os.path.join(os.getcwd(), folder_name)], check=True) + # subprocess.run(["cp", input_pdb, os.path.join(os.getcwd(), folder_name)], check=True) + # subprocess.run(["cp", "-r", input_params, os.path.join(os.getcwd(), folder_name)], check=True) + + # input_yaml = folder_name + "/" + os.path.basename(input_yaml) + # input_pdb = folder_name + "/" + os.path.basename(input_pdb) + # input_params = folder_name + "/" + os.path.basename(input_params) + + input_yaml = os.path.basename(input_yaml) cluster = "local" if block.remote.name != "local": cluster = block.remote.host - if "mn" in cluster: - job = f"mpirun -np {cpus} python -m ActiveSiteDesign {input_yaml} > {output_file}" + if "login" in cluster: + job = f"mpirun -n {cpus} python -m ActiveSiteDesign {input_yaml} > {output_file}" elif cluster == "local": if container is None: - job = f"mpirun -np {cpus} python -m ActiveSiteDesign {input_yaml} > {output_file}" + job = f"mpirun -n {cpus} python -m ActiveSiteDesign {input_yaml} > {output_file}" else: if cpus == 0: job = f"singularity exec {container} python -m ActiveSiteDesign {input_yaml} > {output_file}" else: - job = f"mpirun -np {cpus} singularity exec {container} python -m ActiveSiteDesign {input_yaml} > {output_file}" + job = f"mpirun -n {cpus} singularity exec {container} python -m ActiveSiteDesign {input_yaml} > {output_file}" else: - raise Exception("AsiteDesign can only be run on Marenostrum or local") + raise Exception("AsiteDesign can only be run on nord3 or local") from utils import launchCalculationAction - launchCalculationAction(block, [job], "asitedesign", modulePurge=True) + launchCalculationAction( + block, [job], "asitedesign", modulePurge=True # uploadFolders=folder_name, + ) def finalAsiteAction(block: SlurmBlock): @@ -117,24 +147,22 @@ def finalAsiteAction(block: SlurmBlock): downloaded_path = downloadResultsAction(block) - resultsFolder = block.outputs["folder_name"] - modelsFolder = block.outputs["models"] - - output_folder = os.path.join(downloaded_path, resultsFolder) - block.setOutput(outputFolderAsite.id, output_folder) + for f in os.listdir(downloaded_path): + if f.endswith("_final_pose"): + resultsFolder = f + break - output_Modelfolder = os.path.join(downloaded_path, f"{resultsFolder}/{modelsFolder}") - block.setOutput(outputModelsAsite.id, output_Modelfolder) + block.setOutput(outputModelsAsite.id, os.path.join(downloaded_path, resultsFolder)) from utils import BSC_JOB_VARIABLES asiteDesignBlock = SlurmBlock( name="AsiteDesign", - description="Run AsiteDesign. (For local or marenostrum)", + description="Run AsiteDesign. (For local or nord3)", initialAction=initialAsite, finalAction=finalAsiteAction, - variables=BSC_JOB_VARIABLES + [containerAsite], + variables=BSC_JOB_VARIABLES + [containerAsite, outputFolderAsite, removeExistingResults], inputs=[inputYamlAsite, inputPDBAsite, inputParamsAsite], - outputs=[outputFolderAsite], + outputs=[outputModelsAsite], ) diff --git a/EAPM/Include/Blocks/PrepWizardEAPM.py b/EAPM/Include/Blocks/PrepWizardEAPM.py index 70774a2..3fce5e6 100644 --- a/EAPM/Include/Blocks/PrepWizardEAPM.py +++ b/EAPM/Include/Blocks/PrepWizardEAPM.py @@ -140,10 +140,13 @@ def prepWizardAction(block: SlurmBlock): """ import os + import time if block.selectedInputGroup == fileVariableGroup.id: input_file = block.inputs.get(inputFilePW.id, None) input_folder = "models" + if os.path.exists(input_folder): + input_folder = input_folder + "_" + str(time.time()) os.makedirs(input_folder, exist_ok=True) os.system(f"cp {input_file} {input_folder}") elif block.selectedInputGroup == folderVariableGroup.id: @@ -153,6 +156,8 @@ def prepWizardAction(block: SlurmBlock): # Get prepWizard variables folderName = block.variables.get(folderNameVariable.id, "prepared_proteins") + if os.path.exists(folderName): + folderName = folderName + "_" + str(time.time()) ph = int(block.variables.get(phPW.id, 7)) epikPH = block.variables.get(epikPHPW.id, False) sampleWater = block.variables.get(sampleWaterPW.id, False) diff --git a/EAPM/config/eapm_Local.json b/EAPM/config/eapm_Local.json new file mode 100644 index 0000000..298a495 --- /dev/null +++ b/EAPM/config/eapm_Local.json @@ -0,0 +1,4 @@ +{ + "mafft_path": "MAFFT", + "hmmer_path": "HMMER" +} \ No newline at end of file diff --git a/EAPM/config/eapm_acc.json b/EAPM/config/eapm_acc.json new file mode 100644 index 0000000..298a495 --- /dev/null +++ b/EAPM/config/eapm_acc.json @@ -0,0 +1,4 @@ +{ + "mafft_path": "MAFFT", + "hmmer_path": "HMMER" +} \ No newline at end of file diff --git a/EAPM/config/eapm_nord3_test.json b/EAPM/config/eapm_nord3_test.json new file mode 100644 index 0000000..298a495 --- /dev/null +++ b/EAPM/config/eapm_nord3_test.json @@ -0,0 +1,4 @@ +{ + "mafft_path": "MAFFT", + "hmmer_path": "HMMER" +} \ No newline at end of file