From 662bbff1a37c9cf9ce57f9007c89f89576e740ab Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Fri, 16 Aug 2024 15:34:27 +0800 Subject: [PATCH 01/42] chore: hydra and pip --- .../helixfold/common/all_atom_pdb_save.py | 7 +- .../helixfold/config/helixfold.yaml | 59 ++++ .../infer_scripts/feature_processing_aa.py | 0 .../infer_scripts/preprocess.py | 13 +- .../infer_scripts/tools/mmcif_writer.py | 0 .../helixfold3/{ => helixfold}/inference.py | 289 +++++++----------- .../{ => helixfold}/utils/__init__.py | 0 .../helixfold3/{ => helixfold}/utils/misc.py | 0 .../helixfold3/{ => helixfold}/utils/model.py | 0 .../helixfold3/{ => helixfold}/utils/utils.py | 0 .../protein_folding/helixfold3/pyproject.toml | 48 +++ apps/protein_folding/helixfold3/setup_env.sh | 18 ++ 12 files changed, 248 insertions(+), 186 deletions(-) create mode 100644 apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml rename apps/protein_folding/helixfold3/{ => helixfold}/infer_scripts/feature_processing_aa.py (100%) rename apps/protein_folding/helixfold3/{ => helixfold}/infer_scripts/preprocess.py (97%) rename apps/protein_folding/helixfold3/{ => helixfold}/infer_scripts/tools/mmcif_writer.py (100%) rename apps/protein_folding/helixfold3/{ => helixfold}/inference.py (64%) rename apps/protein_folding/helixfold3/{ => helixfold}/utils/__init__.py (100%) rename apps/protein_folding/helixfold3/{ => helixfold}/utils/misc.py (100%) rename apps/protein_folding/helixfold3/{ => helixfold}/utils/model.py (100%) rename apps/protein_folding/helixfold3/{ => helixfold}/utils/utils.py (100%) create mode 100644 apps/protein_folding/helixfold3/pyproject.toml create mode 100644 apps/protein_folding/helixfold3/setup_env.sh diff --git a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py index deb8e087..9c9f288e 100644 --- a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py +++ b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py @@ -164,10 +164,13 @@ def prediction_to_mmcif(pred_atom_pos: Union[np.ndarray, paddle.Tensor], - maxit_binary: path to maxit_binary, use to convert pdb to cif - mmcif_path: path to save *.cif """ - assert maxit_binary is not None and os.path.exists(maxit_binary), ( + if os.path.isfile(maxit_binary): + raise FileNotFoundError( f'maxit_binary: {maxit_binary} not exists. ' f'link: https://sw-tools.rcsb.org/apps/MAXIT/source.html') - assert mmcif_path.endswith('.cif'), f'mmcif_path should endswith .cif; got {mmcif_path}' + + if not mmcif_path.endswith('.cif'): + raise ValueError(f'mmcif_path should endswith .cif; got {mmcif_path}') pdb_path = mmcif_path.replace('.cif', '.pdb') pdb_path = prediction_to_pdb(pred_atom_pos, FeatsDict, pdb_path) diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml new file mode 100644 index 00000000..fd70ada0 --- /dev/null +++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml @@ -0,0 +1,59 @@ +defaults: + - _self_ + +# General configuration + +bf16_infer: false # Corresponds to --bf16_infer +seed: null # Corresponds to --seed +logging_level: DEBUG # Corresponds to --logging_level +job_id: 'structure_prediction' # Corresponds to --model_name +weight_path: /mnt/db/weights/helixfold/HelixFold3-params-240814/HelixFold3-240814.pdparams # Corresponds to --init_model +precision: fp32 # Corresponds to --precision +amp_level: O1 # Corresponds to --amp_level +infer_times: 1 # Corresponds to --infer_times +diff_batch_size: -1 # Corresponds to --diff_batch_size +use_small_bfd: false # Corresponds to --use_small_bfd + +# File paths + +input: null # Corresponds to --input_json, required field +output: null # Corresponds to --output_dir, required field + + +# Binary tool paths, leave them as null to find proper ones under PATH or conda bin path +bin: + jackhmmer: null # Corresponds to --jackhmmer_binary_path + hhblits: null # Corresponds to --hhblits_binary_path + hhsearch: null # Corresponds to --hhsearch_binary_path + kalign: null # Corresponds to --kalign_binary_path + hmmsearch: null # Corresponds to --hmmsearch_binary_path + hmmbuild: null # Corresponds to --hmmbuild_binary_path + nhmmer: null # Corresponds to --nhmmer_binary_path + obabel: null + +# Database paths +db: + uniprot: /mnt/db/uniprot/uniprot.fasta # Corresponds to --uniprot_database_path, required field + pdb_seqres: /mnt/db/pdb_seqres/pdb_seqres.txt # Corresponds to --pdb_seqres_database_path, required field + uniref90: /mnt/db/uniref90/uniref90.fasta # Corresponds to --uniref90_database_path, required field + mgnify: /mnt/db/mgnify/mgy_clusters.fa # Corresponds to --mgnify_database_path, required field + bfd: /mnt/db/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt # Corresponds to --bfd_database_path + small_bfd: null # Corresponds to --small_bfd_database_path + uniclust30: /mnt/db/uniref30_uc30/UniRef30_2022_02/UniRef30_2022_02 # Corresponds to --uniclust30_database_path + rfam: /mnt/db/helixfold/rna/Rfam-14.9_rep_seq.fasta # Corresponds to --rfam_database_path, required field + ccd_preprocessed: /mnt/db/ccd/ccd_preprocessed_etkdg.pkl.gz # Corresponds to --ccd_preprocessed_path, required field + +# Template and PDB information +template: + mmcif_dir: /mnt/db/pdb_mmcif/mmcif_files # Corresponds to --template_mmcif_dir, required field + max_date: '2023-03-15' # Corresponds to --max_template_date, required field + obsolete_pdbs: /mnt/db/pdb_mmcif/obsolete.dat # Corresponds to --obsolete_pdbs_path, required field + +# Preset configuration +preset: + preset: full_dbs # Corresponds to --preset, choices=['reduced_dbs', 'full_dbs'] + +# Other configurations +other: + maxit_binary: /mnt/data/yinying/software/maxit/maxit-v11.100-prod-src/bin/maxit # Corresponds to --maxit_binary + no_msa_templ_feats: false # Corresponds to --no_msa_templ_feats diff --git a/apps/protein_folding/helixfold3/infer_scripts/feature_processing_aa.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py similarity index 100% rename from apps/protein_folding/helixfold3/infer_scripts/feature_processing_aa.py rename to apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py diff --git a/apps/protein_folding/helixfold3/infer_scripts/preprocess.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py similarity index 97% rename from apps/protein_folding/helixfold3/infer_scripts/preprocess.py rename to apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py index 41cd44ac..eb8eb14f 100644 --- a/apps/protein_folding/helixfold3/infer_scripts/preprocess.py +++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py @@ -5,17 +5,17 @@ 'seqs': ccd_seqs, 'msa_seqs': msa_seqs, 'count': count, - 'extra_mol_infos': {}, for which seqs has the modify residue type or smiles. + 'extra_mol_infos': {}, for which seqs has the modify residue type or smiles. """ import collections import copy +import gzip import os import json import sys import subprocess import tempfile import itertools -sys.path.append('../') import rdkit from rdkit import Chem from rdkit.Chem import AllChem @@ -52,9 +52,7 @@ 3: 'Unknown error.' } -OBABEL_BIN = os.getenv('OBABEL_BIN') -if not os.path.exists(OBABEL_BIN): - raise FileNotFoundError(f'Cannot find obabel binary at {OBABEL_BIN}.') + def read_json(path): @@ -144,6 +142,11 @@ def smiles_toMol_obabel(smiles): """ generate mol from smiles using obabel; """ + + OBABEL_BIN = os.getenv('OBABEL_BIN') + if not (OBABEL_BIN and os.path.isfile(OBABEL_BIN)): + raise FileNotFoundError(f'Cannot find obabel binary at {OBABEL_BIN}.') + with tempfile.NamedTemporaryFile(suffix=".mol2") as temp_file: print(f"[OBABEL] Temporary file created: {temp_file.name}") obabel_cmd = f"{OBABEL_BIN} -:'{smiles}' -omol2 -O{temp_file.name} --gen3d" diff --git a/apps/protein_folding/helixfold3/infer_scripts/tools/mmcif_writer.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/tools/mmcif_writer.py similarity index 100% rename from apps/protein_folding/helixfold3/infer_scripts/tools/mmcif_writer.py rename to apps/protein_folding/helixfold3/helixfold/infer_scripts/tools/mmcif_writer.py diff --git a/apps/protein_folding/helixfold3/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py similarity index 64% rename from apps/protein_folding/helixfold3/inference.py rename to apps/protein_folding/helixfold3/helixfold/inference.py index 51cf6ec6..b3fbf745 100644 --- a/apps/protein_folding/helixfold3/inference.py +++ b/apps/protein_folding/helixfold3/helixfold/inference.py @@ -16,7 +16,6 @@ import re import os import copy -import argparse import random import paddle import json @@ -25,6 +24,11 @@ import shutil import logging import numpy as np +import shutil + +from omegaconf import DictConfig +import hydra + from helixfold.common import all_atom_pdb_save from helixfold.model import config, utils from helixfold.data import pipeline_parallel as pipeline @@ -34,12 +38,14 @@ from helixfold.data.utils import atom_level_keys, map_to_continuous_indices from helixfold.data.tools import hmmsearch from helixfold.data import templates -from utils.utils import get_custom_amp_list -from utils.model import RunModel -from utils.misc import set_logging_level +from helixfold.utils.utils import get_custom_amp_list +from helixfold.utils.model import RunModel +from helixfold.utils.misc import set_logging_level from typing import Dict -from infer_scripts import feature_processing_aa, preprocess -from infer_scripts.tools import mmcif_writer +from helixfold.infer_scripts import feature_processing_aa, preprocess +from helixfold.infer_scripts.tools import mmcif_writer + +script_path=os.path.dirname(__file__) ALLOWED_LIGAND_BONDS_TYPE_MAP = preprocess.ALLOWED_LIGAND_BONDS_TYPE_MAP INVERSE_ALLOWED_LIGAND_BONDS_TYPE_MAP = { @@ -105,45 +111,57 @@ def convert_to_json_compatible(obj): return [convert_to_json_compatible(i) for i in obj] else: return obj - -def get_msa_templates_pipeline(args) -> Dict: - use_precomputed_msas = True # FLAGS.use_precomputed_msas + +def resolve_bin_path(cfg_path: str, default_binary_name: str)-> str: + """Helper function to resolve the binary path.""" + if cfg_path and os.path.isfile(cfg_path): + return cfg_path + + if cfg_val:=shutil.which(default_binary_name): + logging.warning(f'Using resolved {default_binary_name}: {cfg_val}') + return cfg_val + + raise FileNotFoundError(f"Could not find a proper binary path for {default_binary_name}: {cfg_path}.") + +def get_msa_templates_pipeline(cfg: DictConfig) -> Dict: + use_precomputed_msas = True # Assuming this is a constant or should be set globally + template_searcher = hmmsearch.Hmmsearch( - binary_path=args.hmmsearch_binary_path, - hmmbuild_binary_path=args.hmmbuild_binary_path, - database_path=args.pdb_seqres_database_path) + binary_path=resolve_bin_path(cfg.bin.hmmsearch, 'hmmsearch'), + hmmbuild_binary_path=resolve_bin_path(cfg.bin.hmmbuild, 'hmmbuild'), + database_path=cfg.db.pdb_seqres) template_featurizer = templates.HmmsearchHitFeaturizer( - mmcif_dir=args.template_mmcif_dir, - max_template_date=args.max_template_date, + mmcif_dir=cfg.template.mmcif_dir, + max_template_date=cfg.template.max_date, max_hits=MAX_TEMPLATE_HITS, - kalign_binary_path=args.kalign_binary_path, + kalign_binary_path=resolve_bin_path(cfg.bin.kalign, 'kalign'), release_dates_path=None, - obsolete_pdbs_path=args.obsolete_pdbs_path) + obsolete_pdbs_path=cfg.template.obsolete_pdbs) monomer_data_pipeline = pipeline.DataPipeline( - jackhmmer_binary_path=args.jackhmmer_binary_path, - hhblits_binary_path=args.hhblits_binary_path, - hhsearch_binary_path=args.hhsearch_binary_path, - uniref90_database_path=args.uniref90_database_path, - mgnify_database_path=args.mgnify_database_path, - bfd_database_path=args.bfd_database_path, - uniclust30_database_path=args.uniclust30_database_path, - small_bfd_database_path=args.small_bfd_database_path , + jackhmmer_binary_path=resolve_bin_path(cfg.bin.jackhmmer, 'jackhmmer'), + hhblits_binary_path=resolve_bin_path(cfg.bin.hhblits, 'hhblits'), + hhsearch_binary_path=resolve_bin_path(cfg.bin.hhsearch, 'hhsearch'), + uniref90_database_path=cfg.db.uniref90, + mgnify_database_path=cfg.db.mgnify, + bfd_database_path=cfg.db.bfd, + uniclust30_database_path=cfg.db.uniclust30, + small_bfd_database_path=cfg.db.small_bfd, template_searcher=template_searcher, template_featurizer=template_featurizer, - use_small_bfd=args.use_small_bfd, + use_small_bfd=cfg.use_small_bfd, use_precomputed_msas=use_precomputed_msas) prot_data_pipeline = pipeline_multimer.DataPipeline( monomer_data_pipeline=monomer_data_pipeline, - jackhmmer_binary_path=args.jackhmmer_binary_path, - uniprot_database_path=args.uniprot_database_path, + jackhmmer_binary_path=resolve_bin_path(cfg.bin.jackhmmer, 'jackhmmer'), + uniprot_database_path=cfg.db.uniprot, use_precomputed_msas=use_precomputed_msas) rna_monomer_data_pipeline = pipeline_rna.RNADataPipeline( - hmmer_binary_path=args.nhmmer_binary_path, - rfam_database_path=args.rfam_database_path, + hmmer_binary_path=resolve_bin_path(cfg.bin.nhmmer, 'nhmmer'), + rfam_database_path=cfg.db.rfam, rnacentral_database_path=None, nt_database_path=None, species_identifer_map_path=None, @@ -156,7 +174,6 @@ def get_msa_templates_pipeline(args) -> Dict: 'protein': prot_data_pipeline, 'rna': rna_data_pipeline } - def ranking_all_predictions(output_dirs): ranking_score_path_map = {} for outpath in output_dirs: @@ -176,27 +193,29 @@ def ranking_all_predictions(output_dirs): rank_id += 1 @paddle.no_grad() -def eval(args, model, batch): - """evaluate a given dataset""" +def eval(cfg: DictConfig, model:RunModel, batch): + """Evaluate a given dataset""" model.eval() - # inference + # Inference def _forward_with_precision(batch): - if args.precision == "bf16" or args.bf16_infer: + precision=cfg.precision + if precision not in ('bf16','fp32',): + raise ValueError("Please choose precision from bf16 and fp32!") + + if cfg.precision == "bf16" or cfg.bf16_infer: black_list, white_list = get_custom_amp_list() with paddle.amp.auto_cast(enable=True, - custom_white_list=white_list, - custom_black_list=black_list, - level=args.amp_level, - dtype='bfloat16'): + custom_white_list=white_list, + custom_black_list=black_list, + level=cfg.amp_level, + dtype='bfloat16'): return model(batch, compute_loss=False) - elif args.precision == "fp32": - return model(batch, compute_loss=False) - else: - raise ValueError("Please choose precision from bf16 and fp32! ") + + return model(batch, compute_loss=False) res = _forward_with_precision(batch) - logger.info(f"Inference Succeeds...\n") + logger.info("Inference Succeeds...\n") return res @@ -430,52 +449,55 @@ def split_prediction(pred, rank): return prediction -def main(args): - set_logging_level(args.logging_level) +@hydra.main(version_base=None, config_path=os.path.join(script_path,'config',),config_name='helixfold') +def main(cfg: DictConfig): + set_logging_level(cfg.logging_level) """main function""" new_einsum = os.getenv("FLAGS_new_einsum", True) print(f'>>> PaddlePaddle commit: {paddle.version.commit}') print(f'>>> FLAGS_new_einsum: {new_einsum}') - print(f'>>> args:\n{args}') + print(f'>>> config:\n{cfg}') - all_entitys = preprocess_json_entity(args.input_json, args.output_dir) + all_entitys = preprocess_json_entity(cfg.input, cfg.output) ## check maxit binary path - if args.maxit_binary is not None: - assert os.path.exists(args.maxit_binary), \ - f"The maxit binary path {args.maxit_binary} does not exists." + maxit_binary=resolve_bin_path(cfg.other.maxit_binary,'maxit') + + RCSBROOT=os.path.dirname(maxit_binary) + os.environ['RCSBROOT']=RCSBROOT + ## check obabel + obabel_bin=resolve_bin_path(cfg.bin.obabel,'obabel') + os.environ['OBABEL_BIN']=obabel_bin - ### set seed for reproduce experiment results - seed = args.seed + ### Set seed for reproducibility + seed = cfg.seed if seed is None: seed = np.random.randint(10000000) else: - logger.warning('seed is only used for reproduction') + logger.warning('Seed is only used for reproduction') init_seed(seed) - - use_small_bfd = args.preset == 'reduced_dbs' - setattr(args, 'use_small_bfd', use_small_bfd) + use_small_bfd = cfg.preset.preset == 'reduced_dbs' + setattr(cfg, 'use_small_bfd', use_small_bfd) if use_small_bfd: - assert args.small_bfd_database_path is not None + assert cfg.db.small_bfd is not None else: - assert args.bfd_database_path is not None - assert args.uniclust30_database_path is not None + assert cfg.db.bfd is not None + assert cfg.db.uniclust30 is not None logger.info('Getting MSA/Template Pipelines...') msa_templ_data_pipeline_dict = get_msa_templates_pipeline(args) - - ### create model - model_config = config.model_config(args.model_name) - print(f'>>> model_config:\n{model_config}') + ### Create model + model_config = config.model_config(cfg.job_id) + #print(f'>>> model_config:\n{model_config}') model = RunModel(model_config) - if (not args.init_model is None) and (not args.init_model == ""): - print(f"Load pretrain model from {args.init_model}") - pd_params = paddle.load(args.init_model) + if (not cfg.weight_path is None) and (cfg.weight_path != ""): + print(f"Load pretrain model from {cfg.weight_path}") + pd_params = paddle.load(cfg.weight_path) has_opt = 'optimizer' in pd_params if has_opt: @@ -483,42 +505,46 @@ def main(args): else: model.helixfold.set_state_dict(pd_params) - if args.precision == "bf16" and args.amp_level == "O2": + if cfg.precision == "bf16" and cfg.amp_level == "O2": raise NotImplementedError("bf16 O2 is not supported yet.") print(f"============ Data Loading ============") - job_base = pathlib.Path(args.input_json).stem - output_dir_base = pathlib.Path(args.output_dir).joinpath(job_base) + job_base = pathlib.Path(cfg.input).stem + output_dir_base = pathlib.Path(cfg.output).joinpath(job_base) msa_output_dir = output_dir_base.joinpath('msas') msa_output_dir.mkdir(parents=True, exist_ok=True) features_pkl = output_dir_base.joinpath('final_features.pkl') - feature_dict = feature_processing_aa.process_input_json( - all_entitys, - ccd_preprocessed_path=args.ccd_preprocessed_path, - msa_templ_data_pipeline_dict=msa_templ_data_pipeline_dict, - msa_output_dir=msa_output_dir) + if features_pkl.exists(): + with open(features_pkl, 'rb') as f: + feature_dict = pickle.load(f) + else: + feature_dict = feature_processing_aa.process_input_json( + all_entitys, + ccd_preprocessed_path=cfg.db.ccd_preprocessed, + msa_templ_data_pipeline_dict=msa_templ_data_pipeline_dict, + msa_output_dir=msa_output_dir) - # save features - with open(features_pkl, 'wb') as f: - pickle.dump(feature_dict, f, protocol=4) + # save features + with open(features_pkl, 'wb') as f: + pickle.dump(feature_dict, f, protocol=4) feature_dict['feat'] = batch_convert(feature_dict['feat'], add_batch=True) feature_dict['label'] = batch_convert(feature_dict['label'], add_batch=True) print(f"============ Start Inference ============") - infer_times = args.infer_times - if args.diff_batch_size > 0: - model_config.model.heads.diffusion_module.test_diff_batch_size = args.diff_batch_size + infer_times = cfg.infer_times + if cfg.diff_batch_size > 0: + model_config.model.heads.diffusion_module.test_diff_batch_size = cfg.diff_batch_size diff_batch_size = model_config.model.heads.diffusion_module.test_diff_batch_size logger.info(f'Inference {infer_times} Times...') - logger.info(f" diffusion batch size {diff_batch_size}...\n") + logger.info(f"Diffusion batch size {diff_batch_size}...\n") all_pred_path = [] for infer_id in range(infer_times): logger.info(f'Start {infer_id}-th inference...\n') - prediction = eval(args, model, feature_dict) + prediction = eval(cfg, model, feature_dict) # save result prediction = split_prediction(prediction, diff_batch_size) @@ -530,7 +556,7 @@ def main(args): feature_dict=feature_dict, prediction=prediction[rank_id], output_dir=output_dir, - maxit_bin=args.maxit_binary) + maxit_bin=cfg.other.maxit_binary) all_pred_path.append(output_dir) # final ranking @@ -538,100 +564,5 @@ def main(args): ranking_all_predictions(all_pred_path) print(f'============ Inference finished ! ============') - if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument("--bf16_infer", action='store_true', default=False) - parser.add_argument("--seed", type=int, default=None, help="set seed for reproduce experiment results, None is do not set seed") - parser.add_argument("--logging_level", type=str, default="DEBUG", help="NOTSET, DEBUG, INFO, WARNING, ERROR, CRITICAL") - parser.add_argument("--model_name", type=str, help='used to choose model config') - parser.add_argument("--init_model", type=str, default='') - parser.add_argument("--precision", type=str, choices=['fp32', 'bf16'], default='fp32') - parser.add_argument("--amp_level", type=str, default='O1') - parser.add_argument("--infer_times", type=int, default=1) - parser.add_argument("--diff_batch_size", type=int, default=-1) - parser.add_argument('--input_json', type=str, - default=None, required=True, - help='Paths to json file, each containing ' - 'entity information including sequence, smiles or CCD, copies etc.') - parser.add_argument('--output_dir', type=str, - default=None, required=True, - help='Path to a directory that will store results.') - parser.add_argument('--ccd_preprocessed_path', type=str, - default=None, required=True, - help='Path to CCD preprocessed files.') - parser.add_argument('--jackhmmer_binary_path', type=str, - default='/usr/bin/jackhmmer', - help='Path to the JackHMMER executable.') - parser.add_argument('--hhblits_binary_path', type=str, - default='/usr/bin/hhblits', - help='Path to the HHblits executable.') - parser.add_argument('--hhsearch_binary_path', type=str, - default='/usr/bin/hhsearch', - help='Path to the HHsearch executable.') - parser.add_argument('--kalign_binary_path', type=str, - default='/usr/bin/kalign', - help='Path to the Kalign executable.') - parser.add_argument('--hmmsearch_binary_path', type=str, - default='/usr/bin/hmmsearch', - help='Path to the hmmsearch executable.') - parser.add_argument('--hmmbuild_binary_path', type=str, - default='/usr/bin/hmmbuild', - help='Path to the hmmbuild executable.') - - # binary path of the tool for RNA MSA searching - parser.add_argument('--nhmmer_binary_path', type=str, - default='/usr/bin/nhmmer', - help='Path to the nhmmer executable.') - - parser.add_argument('--uniprot_database_path', type=str, - default=None, required=True, - help='Path to the Uniprot database for use ' - 'by JackHMMER.') - parser.add_argument('--pdb_seqres_database_path', type=str, - default=None, required=True, - help='Path to the PDB ' - 'seqres database for use by hmmsearch.') - parser.add_argument('--uniref90_database_path', type=str, - default=None, required=True, - help='Path to the Uniref90 database for use ' - 'by JackHMMER.') - parser.add_argument('--mgnify_database_path', type=str, - default=None, required=True, - help='Path to the MGnify database for use by ' - 'JackHMMER.') - parser.add_argument('--bfd_database_path', type=str, default=None, - help='Path to the BFD database for use by HHblits.') - parser.add_argument('--small_bfd_database_path', type=str, default=None, - help='Path to the small version of BFD used ' - 'with the "reduced_dbs" preset.') - parser.add_argument('--uniclust30_database_path', type=str, default=None, - help='Path to the Uniclust30 database for use ' - 'by HHblits.') - # RNA MSA searching databases - parser.add_argument('--rfam_database_path', type=str, - default=None, required=True, - help='Path to the Rfam database for RNA MSA searching.') - parser.add_argument('--template_mmcif_dir', type=str, - default=None, required=True, - help='Path to a directory with template mmCIF ' - 'structures, each named .cif') - parser.add_argument('--max_template_date', type=str, - default=None, required=True, - help='Maximum template release date to consider. ' - 'Important if folding historical test sets.') - parser.add_argument('--obsolete_pdbs_path', type=str, - default=None, required=True, - help='Path to file containing a mapping from ' - 'obsolete PDB IDs to the PDB IDs of their ' - 'replacements.') - parser.add_argument('--preset', - default='full_dbs', required=False, - choices=['reduced_dbs', 'full_dbs'], - help='Choose preset model configuration - ' - 'no ensembling and smaller genetic database ' - 'config (reduced_dbs), no ensembling and full ' - 'genetic database config (full_dbs)') - parser.add_argument('--maxit_binary', type=str, default=None) - args = parser.parse_args() - main(args) + main() diff --git a/apps/protein_folding/helixfold3/utils/__init__.py b/apps/protein_folding/helixfold3/helixfold/utils/__init__.py similarity index 100% rename from apps/protein_folding/helixfold3/utils/__init__.py rename to apps/protein_folding/helixfold3/helixfold/utils/__init__.py diff --git a/apps/protein_folding/helixfold3/utils/misc.py b/apps/protein_folding/helixfold3/helixfold/utils/misc.py similarity index 100% rename from apps/protein_folding/helixfold3/utils/misc.py rename to apps/protein_folding/helixfold3/helixfold/utils/misc.py diff --git a/apps/protein_folding/helixfold3/utils/model.py b/apps/protein_folding/helixfold3/helixfold/utils/model.py similarity index 100% rename from apps/protein_folding/helixfold3/utils/model.py rename to apps/protein_folding/helixfold3/helixfold/utils/model.py diff --git a/apps/protein_folding/helixfold3/utils/utils.py b/apps/protein_folding/helixfold3/helixfold/utils/utils.py similarity index 100% rename from apps/protein_folding/helixfold3/utils/utils.py rename to apps/protein_folding/helixfold3/helixfold/utils/utils.py diff --git a/apps/protein_folding/helixfold3/pyproject.toml b/apps/protein_folding/helixfold3/pyproject.toml new file mode 100644 index 00000000..bc988ca9 --- /dev/null +++ b/apps/protein_folding/helixfold3/pyproject.toml @@ -0,0 +1,48 @@ +[build-system] +requires = ["poetry-core>=1.0.0,<2.0.0"] +build-backend = "poetry.core.masonry.api" + +[tool.poetry] +name = "helixfold" +version = "3.0.0" +description = "Code for helixfold v3" +authors = ["Name "] + +readme = "README.md" +license = "MIT" +repository = "https://github.com/PaddlePaddle/PaddleHelix/blob/dev/apps/protein_folding/helixfold3" +classifiers = [ + "Topic :: Scientific/Engineering :: Biochemistry", + "Topic :: Scientific/Engineering :: Protein Engineering" +] + + +packages = [ + { include = "helixfold" }, + { include = "helixfold/*.py" }, +] + + +[tool.poetry.dependencies] +python = "^3.8" + +absl-py = "0.13.0" +biopython = "1.79" +chex = "0.0.7" +dm-haiku = "0.0.4" +dm-tree = "0.1.6" +docker = "5.0.0" +immutabledict = "2.0.0" +jax = "0.2.14" +ml-collections = "0.1.0" +pandas = "1.3.4" +scipy = "1.9.0" +rdkit-pypi = "2022.9.5" +posebusters = "*" +hydra-core= "^1.3.2" +omegaconf = "^2.3.0" + + + +[tool.poetry.scripts] +helixfold = 'helixfold.inference:main' diff --git a/apps/protein_folding/helixfold3/setup_env.sh b/apps/protein_folding/helixfold3/setup_env.sh new file mode 100644 index 00000000..30f008d6 --- /dev/null +++ b/apps/protein_folding/helixfold3/setup_env.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +ENV_NAME='helixfold' +CUDA=12.0 + +# follow https://developer.nvidia.com/cuda-downloads to install cuda and cudatoolkit + +# Install py env +conda create -n ${ENV_NAME} -y -c conda-forge pip python=3.9; +source activate ${ENV_NAME} +conda install -y cudnn=8.4.1 cudatoolkit=11.7 nccl=2.14.3 -c conda-forge -c nvidia + +conda install -y -c bioconda hmmer==3.3.2 kalign2==2.04 hhsuite==3.3.0 +conda install -y -c conda-forge openbabel + +python -m pip install --upgrade 'pip<24';pip install . --no-cache-dir + +pip install https://paddle-wheel.bj.bcebos.com/2.5.1/linux/linux-gpu-cuda11.7-cudnn8.4.1-mkl-gcc8.2-avx/paddlepaddle_gpu-2.5.1.post117-cp39-cp39-linux_x86_64.whl From b4fce8af392b03b030610d810d878d65fee5b79b Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Sat, 17 Aug 2024 10:52:13 +0800 Subject: [PATCH 02/42] use reduced bfd --- .../helixfold3/helixfold/config/helixfold.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml index fd70ada0..60e8076d 100644 --- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml +++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml @@ -29,7 +29,7 @@ bin: hmmsearch: null # Corresponds to --hmmsearch_binary_path hmmbuild: null # Corresponds to --hmmbuild_binary_path nhmmer: null # Corresponds to --nhmmer_binary_path - obabel: null + obabel: null # Inject to env as OBABEL_BIN # Database paths db: @@ -38,7 +38,7 @@ db: uniref90: /mnt/db/uniref90/uniref90.fasta # Corresponds to --uniref90_database_path, required field mgnify: /mnt/db/mgnify/mgy_clusters.fa # Corresponds to --mgnify_database_path, required field bfd: /mnt/db/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt # Corresponds to --bfd_database_path - small_bfd: null # Corresponds to --small_bfd_database_path + small_bfd: /mnt/db/reduced_bfd/bfd-first_non_consensus_sequences.fasta # Corresponds to --small_bfd_database_path uniclust30: /mnt/db/uniref30_uc30/UniRef30_2022_02/UniRef30_2022_02 # Corresponds to --uniclust30_database_path rfam: /mnt/db/helixfold/rna/Rfam-14.9_rep_seq.fasta # Corresponds to --rfam_database_path, required field ccd_preprocessed: /mnt/db/ccd/ccd_preprocessed_etkdg.pkl.gz # Corresponds to --ccd_preprocessed_path, required field @@ -51,7 +51,7 @@ template: # Preset configuration preset: - preset: full_dbs # Corresponds to --preset, choices=['reduced_dbs', 'full_dbs'] + preset: reduced_dbs # Corresponds to --preset, choices=['reduced_dbs', 'full_dbs'] # Other configurations other: From d2aa1b9fdcd35e48955c87f51c33bfa221a895d2 Mon Sep 17 00:00:00 2001 From: Ryan Garcia <47666442+RyanGarciaLI@users.noreply.github.com> Date: Thu, 15 Aug 2024 21:20:39 +0800 Subject: [PATCH 03/42] disable no MSA mode (#312) From d52830086c4200a77abcaa11ee927475420f6f2c Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Sat, 17 Aug 2024 11:37:57 +0800 Subject: [PATCH 04/42] add override flag --- .../helixfold3/helixfold/config/helixfold.yaml | 2 +- apps/protein_folding/helixfold3/helixfold/inference.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml index 60e8076d..3428b324 100644 --- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml +++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml @@ -18,6 +18,7 @@ use_small_bfd: false # Corresponds to --use_small_bfd input: null # Corresponds to --input_json, required field output: null # Corresponds to --output_dir, required field +override: false # Set true to override existing msa output directory # Binary tool paths, leave them as null to find proper ones under PATH or conda bin path @@ -56,4 +57,3 @@ preset: # Other configurations other: maxit_binary: /mnt/data/yinying/software/maxit/maxit-v11.100-prod-src/bin/maxit # Corresponds to --maxit_binary - no_msa_templ_feats: false # Corresponds to --no_msa_templ_feats diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py index b3fbf745..0105a1e3 100644 --- a/apps/protein_folding/helixfold3/helixfold/inference.py +++ b/apps/protein_folding/helixfold3/helixfold/inference.py @@ -487,7 +487,7 @@ def main(cfg: DictConfig): assert cfg.db.uniclust30 is not None logger.info('Getting MSA/Template Pipelines...') - msa_templ_data_pipeline_dict = get_msa_templates_pipeline(args) + msa_templ_data_pipeline_dict = get_msa_templates_pipeline(cfg=cfg) ### Create model model_config = config.model_config(cfg.job_id) @@ -515,8 +515,9 @@ def main(cfg: DictConfig): msa_output_dir.mkdir(parents=True, exist_ok=True) features_pkl = output_dir_base.joinpath('final_features.pkl') - if features_pkl.exists(): + if features_pkl.exists() and not cfg.override: with open(features_pkl, 'rb') as f: + logging.info(f'Load features from precomputed {features_pkl}') feature_dict = pickle.load(f) else: feature_dict = feature_processing_aa.process_input_json( From a3d62e2e6b0ae6ba7cd141f56ab9d086bec897dd Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Sat, 17 Aug 2024 15:09:33 +0800 Subject: [PATCH 05/42] add configs --- .../helixfold/config/helixfold.yaml | 12 ++++++++ .../helixfold3/helixfold/model/config.py | 29 ++++++++++++++++--- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml index 3428b324..12ae893e 100644 --- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml +++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml @@ -57,3 +57,15 @@ preset: # Other configurations other: maxit_binary: /mnt/data/yinying/software/maxit/maxit-v11.100-prod-src/bin/maxit # Corresponds to --maxit_binary + + +# CONFIG_DIFFS for advanced configuration +CONFIG_DIFFS: + preset: null #choices=['null','allatom_demo', 'allatom_subbatch_64_recycle_1'] + model: + global_config: + subbatch_size: 96 # model.global_config.subbatch_size + num_recycle: 3 # model.num_recycle + heads: + confidence_head: + weight: 0.0 # model.heads.confidence_head.weight diff --git a/apps/protein_folding/helixfold3/helixfold/model/config.py b/apps/protein_folding/helixfold3/helixfold/model/config.py index 6da8566a..2eafaaa6 100644 --- a/apps/protein_folding/helixfold3/helixfold/model/config.py +++ b/apps/protein_folding/helixfold3/helixfold/model/config.py @@ -15,7 +15,9 @@ """Model config.""" import copy +from typing import Any, Union import ml_collections +from omegaconf import DictConfig NUM_RES = 'num residues placeholder' @@ -24,16 +26,35 @@ NUM_TEMPLATES = 'num templates placeholder' -def model_config(name: str) -> ml_collections.ConfigDict: +def model_config(config_diffs: Union[str, DictConfig, dict[str, dict[str, Any]]]) -> ml_collections.ConfigDict: """Get the ConfigDict of a model.""" cfg = copy.deepcopy(CONFIG_ALLATOM) - if name in CONFIG_DIFFS: - cfg.update_from_flattened_dict(CONFIG_DIFFS[name]) + if config_diffs is None or config_diffs=='': + # early return if nothing is changed + return cfg - return cfg + if isinstance(config_diffs, DictConfig): + if 'preset' in config_diffs and (preset_name:=config_diffs['preset']) in CONFIG_DIFFS: + updated_config=CONFIG_DIFFS[preset_name] + cfg.update_from_flattened_dict(updated_config) + print(f'Updated config from `CONFIG_DIFFS.{preset_name}`: {updated_config}') + return cfg + if 'model' in config_diffs and (updated_config := config_diffs['model']) is not None: + cfg.update(dict(updated_config)) + print(f'Updated config from `CONFIG_DIFFS`: {updated_config}') + return cfg + + raise ValueError(f'Invalid config_diffs ({type(config_diffs)}): {config_diffs}') + + + + + + +# preset for runs CONFIG_DIFFS = { 'allatom_demo': { 'model.heads.confidence_head.weight': 0.01 From 6e39caa25802b72cc8d11d111a85094e8dc57cd7 Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Sat, 17 Aug 2024 15:29:02 +0800 Subject: [PATCH 06/42] fix:config --- .../helixfold3/helixfold/config/helixfold.yaml | 8 ++++---- .../helixfold3/helixfold/inference.py | 3 +-- .../helixfold3/helixfold/model/config.py | 15 +++++++-------- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml index 12ae893e..561108fa 100644 --- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml +++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml @@ -65,7 +65,7 @@ CONFIG_DIFFS: model: global_config: subbatch_size: 96 # model.global_config.subbatch_size - num_recycle: 3 # model.num_recycle - heads: - confidence_head: - weight: 0.0 # model.heads.confidence_head.weight + num_recycle: 3 # model.num_recycle + heads: + confidence_head: + weight: 0.0 # model.heads.confidence_head.weight diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py index 0105a1e3..5ca95e3b 100644 --- a/apps/protein_folding/helixfold3/helixfold/inference.py +++ b/apps/protein_folding/helixfold3/helixfold/inference.py @@ -490,8 +490,7 @@ def main(cfg: DictConfig): msa_templ_data_pipeline_dict = get_msa_templates_pipeline(cfg=cfg) ### Create model - model_config = config.model_config(cfg.job_id) - #print(f'>>> model_config:\n{model_config}') + model_config = config.model_config(cfg.CONFIG_DIFFS) model = RunModel(model_config) diff --git a/apps/protein_folding/helixfold3/helixfold/model/config.py b/apps/protein_folding/helixfold3/helixfold/model/config.py index 2eafaaa6..c4682b29 100644 --- a/apps/protein_folding/helixfold3/helixfold/model/config.py +++ b/apps/protein_folding/helixfold3/helixfold/model/config.py @@ -41,18 +41,17 @@ def model_config(config_diffs: Union[str, DictConfig, dict[str, dict[str, Any]]] print(f'Updated config from `CONFIG_DIFFS.{preset_name}`: {updated_config}') return cfg - if 'model' in config_diffs and (updated_config := config_diffs['model']) is not None: - cfg.update(dict(updated_config)) - print(f'Updated config from `CONFIG_DIFFS`: {updated_config}') - + + # update from detailed configuration + if any(root_kw in config_diffs for root_kw in CONFIG_ALLATOM): + for root_kw in CONFIG_ALLATOM: + if root_kw in config_diffs and (updated_config := config_diffs[root_kw]) is not None: + cfg.update(dict(updated_config)) + print(f'Updated config from `CONFIG_DIFFS`: {updated_config}') return cfg raise ValueError(f'Invalid config_diffs ({type(config_diffs)}): {config_diffs}') - - - - # preset for runs CONFIG_DIFFS = { From 29ef9c49241b3badd5851f93623ad83b11666bda Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Sat, 17 Aug 2024 16:05:58 +0800 Subject: [PATCH 07/42] fix: maxit path --- .../helixfold/common/all_atom_pdb_save.py | 2 +- .../helixfold/config/helixfold.yaml | 1 - .../helixfold3/helixfold/inference.py | 28 +++++++++---------- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py index 9c9f288e..8d541bd7 100644 --- a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py +++ b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py @@ -164,7 +164,7 @@ def prediction_to_mmcif(pred_atom_pos: Union[np.ndarray, paddle.Tensor], - maxit_binary: path to maxit_binary, use to convert pdb to cif - mmcif_path: path to save *.cif """ - if os.path.isfile(maxit_binary): + if not os.path.isfile(maxit_binary): raise FileNotFoundError( f'maxit_binary: {maxit_binary} not exists. ' f'link: https://sw-tools.rcsb.org/apps/MAXIT/source.html') diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml index 561108fa..623a3561 100644 --- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml +++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml @@ -6,7 +6,6 @@ defaults: bf16_infer: false # Corresponds to --bf16_infer seed: null # Corresponds to --seed logging_level: DEBUG # Corresponds to --logging_level -job_id: 'structure_prediction' # Corresponds to --model_name weight_path: /mnt/db/weights/helixfold/HelixFold3-params-240814/HelixFold3-240814.pdparams # Corresponds to --init_model precision: fp32 # Corresponds to --precision amp_level: O1 # Corresponds to --amp_level diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py index 5ca95e3b..97ae415c 100644 --- a/apps/protein_folding/helixfold3/helixfold/inference.py +++ b/apps/protein_folding/helixfold3/helixfold/inference.py @@ -193,29 +193,27 @@ def ranking_all_predictions(output_dirs): rank_id += 1 @paddle.no_grad() -def eval(cfg: DictConfig, model:RunModel, batch): - """Evaluate a given dataset""" +def eval(args, model, batch): + """evaluate a given dataset""" model.eval() - # Inference + # inference def _forward_with_precision(batch): - precision=cfg.precision - if precision not in ('bf16','fp32',): - raise ValueError("Please choose precision from bf16 and fp32!") - - if cfg.precision == "bf16" or cfg.bf16_infer: + if args.precision == "bf16" or args.bf16_infer: black_list, white_list = get_custom_amp_list() with paddle.amp.auto_cast(enable=True, - custom_white_list=white_list, - custom_black_list=black_list, - level=cfg.amp_level, - dtype='bfloat16'): + custom_white_list=white_list, + custom_black_list=black_list, + level=args.amp_level, + dtype='bfloat16'): return model(batch, compute_loss=False) - - return model(batch, compute_loss=False) + elif args.precision == "fp32": + return model(batch, compute_loss=False) + else: + raise ValueError("Please choose precision from bf16 and fp32! ") res = _forward_with_precision(batch) - logger.info("Inference Succeeds...\n") + logger.info(f"Inference Succeeds...\n") return res From 6ef4cd89be76ac48f4060569501d408f57bf530d Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Sat, 17 Aug 2024 18:26:57 +0800 Subject: [PATCH 08/42] fix: maxit run with env --- .../helixfold/common/all_atom_pdb_save.py | 27 ++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py index 8d541bd7..f15f956d 100644 --- a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py +++ b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py @@ -21,6 +21,7 @@ import paddle import itertools import os +import subprocess FeatureDict = Mapping[str, np.ndarray] ModelOutput = Mapping[str, Any] # Is a nested dict. @@ -174,7 +175,27 @@ def prediction_to_mmcif(pred_atom_pos: Union[np.ndarray, paddle.Tensor], pdb_path = mmcif_path.replace('.cif', '.pdb') pdb_path = prediction_to_pdb(pred_atom_pos, FeatsDict, pdb_path) - msg = os.system(f'{maxit_binary} -i {pdb_path} -o 1 -output {mmcif_path}') - if msg != 0: - print(f'convert pdb to cif failed, error message: {msg}') + + cmd=[maxit_binary, + '-i', pdb_path, + '-o', '1', + '-output', mmcif_path, + ] + + print('Launching subprocess "%s"', ' '.join(cmd)) + + process = subprocess.Popen( + ' '.join(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=os.environ.copy()) + + + stdout, stderr = process.communicate() + retcode = process.wait() + + + if retcode: + # Logs have a 15k character limit, so log HHblits error line by line. + print('maxit failed. HHblits stderr begin:') + raise RuntimeError('HHblits failed\nstdout:\n%s\n\nstderr:\n%s\n' % ( + stdout.decode('utf-8'), stderr[:500_000].decode('utf-8'))) + return mmcif_path \ No newline at end of file From 5568284d92808865e27f5fe04632dff4f9ce4cab Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Sat, 17 Aug 2024 18:29:16 +0800 Subject: [PATCH 09/42] fix: maxit run with env --- .../helixfold3/helixfold/common/all_atom_pdb_save.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py index f15f956d..92e7d225 100644 --- a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py +++ b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py @@ -185,7 +185,7 @@ def prediction_to_mmcif(pred_atom_pos: Union[np.ndarray, paddle.Tensor], print('Launching subprocess "%s"', ' '.join(cmd)) process = subprocess.Popen( - ' '.join(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=os.environ.copy()) + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=os.environ.copy()) stdout, stderr = process.communicate() From ae9146a5b01fc352999b893fd9ee8a6de5ab9c3c Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Sat, 17 Aug 2024 18:43:53 +0800 Subject: [PATCH 10/42] copy of license --- apps/protein_folding/helixfold3/FOLDING.md | 7 ++++ .../helixfold3/helixfold/LICENSE | 36 +++++++++++++++++++ .../helixfold/common/all_atom_pdb_save.py | 12 +++++-- .../helixfold3/helixfold/inference.py | 2 +- 4 files changed, 53 insertions(+), 4 deletions(-) create mode 100644 apps/protein_folding/helixfold3/FOLDING.md create mode 100644 apps/protein_folding/helixfold3/helixfold/LICENSE diff --git a/apps/protein_folding/helixfold3/FOLDING.md b/apps/protein_folding/helixfold3/FOLDING.md new file mode 100644 index 00000000..17dc6ebc --- /dev/null +++ b/apps/protein_folding/helixfold3/FOLDING.md @@ -0,0 +1,7 @@ +# FOLD GUIDE + +## Quick example +```shell +LD_LIBRARY_PATH=/mnt/data/envs/conda_env/envs/helixfold/lib/:$LD_LIBRARY_PATH helixfold input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_8ecx.json output=. CONFIG_DIFFS.preset=allatom_demo + +``` \ No newline at end of file diff --git a/apps/protein_folding/helixfold3/helixfold/LICENSE b/apps/protein_folding/helixfold3/helixfold/LICENSE new file mode 100644 index 00000000..fb84eeb3 --- /dev/null +++ b/apps/protein_folding/helixfold3/helixfold/LICENSE @@ -0,0 +1,36 @@ +# Terms of Use Agreement +The HelixFold 3 open-source code and any derivative works are only available for non-commercial use by individuals and non-commercial organizations (universities, non-profit organizations and research institutes, educational and government bodies), or for journalism. + + +# Usage Restrictions: +The use of the HelixFold 3 open-source code is subject to the following conditions and restrictions: + 1. Commercial Use Prohibited: The code and its outputs shall not be used by or for commercial entities, nor in any context related to commercial activities, including conducting research for commercial purposes or conferring any rights to commercial entities to use the outputs. + + 2. Integration with Automated Systems: The open-source code shall not be incorporated into any unauthorized automated systems, including, but not limited to, workflows for the screening or design of biomolecules. + + 3. Attribution Requirement: In any academic publication derived from the use of the open-source code, proper attribution to the original authors must be given in the manner specified, including the retention of the author's name, copyright notice, and a link to the license agreement, as well as a statement indicating whether modifications were made to the materials. + + 4. Share-Alike Obligation: If you modify or adapt the materials (e.g., through translation or rewriting), you must distribute your derivative works under the same "Attribution-NonCommercial-ShareAlike" license. This ensures that others may use your adapted work under identical terms. + + 5. Prohibition on Illegal or Malicious Use: The code shall not be used to facilitate or engage in activities that are illegal, dangerous, or malicious. + +For commercial use, you must contact the authors to obtain a commercial license.\ +**Disclaimers:** + +The HelixFold 3 open-source code any derivative works are only for theoretical modelling. These are not intended, validated, or approved for clinical use. + +**Governing Law and Dispute Resolution:** \ +This agreement is governed by the laws of the People's Republic of China. In the event of a dispute arising from the execution of this agreement, the parties shall attempt to resolve it through amicable consultation. If consultation fails, either party may submit the dispute to the People's Court of Haidian District, Beijing for adjudication. + +**Termination:** + - **Automatic Termination:** The rights granted under this license will automatically terminate if you violate any of its terms. Rights may be reinstated upon rectification of the violation or with the licensor's express consent. + + - **Surviving Provisions:** Certain provisions, including disclaimers and liability limitations, shall remain in effect even after the termination of this license. + + + +**Miscellaneous Provisions:** + - **No Additional Restrictions:** You may not impose any additional restrictions on the use of the materials, such as implementing technical protection measures (e.g., encryption), as doing so would contravene the spirit of this license. + - **Interpretation Authority:** Baidu reserves the right to make reasonable interpretations of this open-source service. + +Should any provision of this agreement be deemed invalid or unenforceable, the remaining provisions shall continue to be in full force and effect. diff --git a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py index 92e7d225..abc33ef6 100644 --- a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py +++ b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py @@ -184,6 +184,10 @@ def prediction_to_mmcif(pred_atom_pos: Union[np.ndarray, paddle.Tensor], print('Launching subprocess "%s"', ' '.join(cmd)) + if os.path.exists('maxit.log'): + os.remove('maxit.log') + + process = subprocess.Popen( cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=os.environ.copy()) @@ -194,8 +198,10 @@ def prediction_to_mmcif(pred_atom_pos: Union[np.ndarray, paddle.Tensor], if retcode: # Logs have a 15k character limit, so log HHblits error line by line. - print('maxit failed. HHblits stderr begin:') - raise RuntimeError('HHblits failed\nstdout:\n%s\n\nstderr:\n%s\n' % ( - stdout.decode('utf-8'), stderr[:500_000].decode('utf-8'))) + print('Maxit failed. Maxit stderr begin:') + raise RuntimeError(f'Maxit failed\nstdout:\n{stdout.decode("utf-8")}\n\n' + f'stderr:\n{stderr[:500_000].decode("utf-8")}\n' + f'logfile:\n{open("maxit.log", "r").read().strip() if os.path.isfile("maxit.log") else ""}\n' + f'Env:\n{os.environ.copy()}') return mmcif_path \ No newline at end of file diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py index 97ae415c..5a883094 100644 --- a/apps/protein_folding/helixfold3/helixfold/inference.py +++ b/apps/protein_folding/helixfold3/helixfold/inference.py @@ -461,7 +461,7 @@ def main(cfg: DictConfig): ## check maxit binary path maxit_binary=resolve_bin_path(cfg.other.maxit_binary,'maxit') - RCSBROOT=os.path.dirname(maxit_binary) + RCSBROOT=os.path.join(os.path.dirname(maxit_binary), '..') os.environ['RCSBROOT']=RCSBROOT ## check obabel From 5aae769b0dd7b5064db7c436ec18b1ecd0e7badf Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Sat, 17 Aug 2024 19:05:59 +0800 Subject: [PATCH 11/42] fix: obabel bin resolve --- apps/protein_folding/helixfold3/FOLDING.md | 5 +++++ apps/protein_folding/helixfold3/helixfold/inference.py | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/apps/protein_folding/helixfold3/FOLDING.md b/apps/protein_folding/helixfold3/FOLDING.md index 17dc6ebc..80170f97 100644 --- a/apps/protein_folding/helixfold3/FOLDING.md +++ b/apps/protein_folding/helixfold3/FOLDING.md @@ -1,7 +1,12 @@ # FOLD GUIDE ## Quick example +Run from default config ```shell LD_LIBRARY_PATH=/mnt/data/envs/conda_env/envs/helixfold/lib/:$LD_LIBRARY_PATH helixfold input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_8ecx.json output=. CONFIG_DIFFS.preset=allatom_demo +``` +Run with customized configuration dir and name: +```shell +LD_LIBRARY_PATH=/mnt/data/envs/conda_env/envs/helixfold/lib/:$LD_LIBRARY_PATH helixfold --config-dir=. --config-name=myfold input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_6zcy_smiles.json output=. CONFIG_DIFFS.preset=allatom_demo ``` \ No newline at end of file diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py index 5a883094..d26240cf 100644 --- a/apps/protein_folding/helixfold3/helixfold/inference.py +++ b/apps/protein_folding/helixfold3/helixfold/inference.py @@ -457,7 +457,6 @@ def main(cfg: DictConfig): print(f'>>> FLAGS_new_einsum: {new_einsum}') print(f'>>> config:\n{cfg}') - all_entitys = preprocess_json_entity(cfg.input, cfg.output) ## check maxit binary path maxit_binary=resolve_bin_path(cfg.other.maxit_binary,'maxit') @@ -468,6 +467,8 @@ def main(cfg: DictConfig): obabel_bin=resolve_bin_path(cfg.bin.obabel,'obabel') os.environ['OBABEL_BIN']=obabel_bin + all_entitys = preprocess_json_entity(cfg.input, cfg.output) + ### Set seed for reproducibility seed = cfg.seed if seed is None: From b63e7220799c38a4d6b79920f606ad7e73fec2bf Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Sat, 17 Aug 2024 19:07:15 +0800 Subject: [PATCH 12/42] doc: use conda prefix --- apps/protein_folding/helixfold3/FOLDING.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/protein_folding/helixfold3/FOLDING.md b/apps/protein_folding/helixfold3/FOLDING.md index 80170f97..b658f016 100644 --- a/apps/protein_folding/helixfold3/FOLDING.md +++ b/apps/protein_folding/helixfold3/FOLDING.md @@ -3,10 +3,10 @@ ## Quick example Run from default config ```shell -LD_LIBRARY_PATH=/mnt/data/envs/conda_env/envs/helixfold/lib/:$LD_LIBRARY_PATH helixfold input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_8ecx.json output=. CONFIG_DIFFS.preset=allatom_demo +LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH helixfold input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_8ecx.json output=. CONFIG_DIFFS.preset=allatom_demo ``` Run with customized configuration dir and name: ```shell -LD_LIBRARY_PATH=/mnt/data/envs/conda_env/envs/helixfold/lib/:$LD_LIBRARY_PATH helixfold --config-dir=. --config-name=myfold input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_6zcy_smiles.json output=. CONFIG_DIFFS.preset=allatom_demo +LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH helixfold --config-dir=. --config-name=myfold input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_6zcy_smiles.json output=. CONFIG_DIFFS.preset=allatom_demo ``` \ No newline at end of file From 0863ecfe298e5a44419e6e96c4bec8cab3b4c240 Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Sat, 17 Aug 2024 21:48:01 +0800 Subject: [PATCH 13/42] doc: merge to readme --- apps/protein_folding/helixfold3/FOLDING.md | 12 --- apps/protein_folding/helixfold3/README.md | 88 +++++++------------ .../helixfold/config/helixfold.yaml | 1 + .../helixfold3/helixfold/inference.py | 3 +- .../helixfold3/requirements.txt | 13 --- apps/protein_folding/helixfold3/run_infer.sh | 39 -------- apps/protein_folding/helixfold3/setup_env.sh | 18 ---- 7 files changed, 37 insertions(+), 137 deletions(-) delete mode 100644 apps/protein_folding/helixfold3/FOLDING.md delete mode 100644 apps/protein_folding/helixfold3/requirements.txt delete mode 100644 apps/protein_folding/helixfold3/run_infer.sh delete mode 100644 apps/protein_folding/helixfold3/setup_env.sh diff --git a/apps/protein_folding/helixfold3/FOLDING.md b/apps/protein_folding/helixfold3/FOLDING.md deleted file mode 100644 index b658f016..00000000 --- a/apps/protein_folding/helixfold3/FOLDING.md +++ /dev/null @@ -1,12 +0,0 @@ -# FOLD GUIDE - -## Quick example -Run from default config -```shell -LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH helixfold input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_8ecx.json output=. CONFIG_DIFFS.preset=allatom_demo -``` - -Run with customized configuration dir and name: -```shell -LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH helixfold --config-dir=. --config-name=myfold input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_6zcy_smiles.json output=. CONFIG_DIFFS.preset=allatom_demo -``` \ No newline at end of file diff --git a/apps/protein_folding/helixfold3/README.md b/apps/protein_folding/helixfold3/README.md index 12ed7041..bdf4264e 100644 --- a/apps/protein_folding/helixfold3/README.md +++ b/apps/protein_folding/helixfold3/README.md @@ -44,17 +44,26 @@ Locate to the directory of `helixfold` then run: ```bash # Install py env conda create -n helixfold -c conda-forge python=3.9 -conda install -y -c bioconda aria2 hmmer==3.3.2 kalign2==2.04 hhsuite==3.3.0 -n helixfold -conda install -y -c conda-forge openbabel -n helixfold # activate the conda environment conda activate helixfold +# adjust these version numbers as your situation +conda install -y cudnn=8.4.1 cudatoolkit=11.7 nccl=2.14.3 -c conda-forge -c nvidia +conda install -y -c bioconda hmmer==3.3.2 kalign2==2.04 hhsuite==3.3.0 +conda install -y -c conda-forge openbabel + # install paddlepaddle -python3 -m pip install paddlepaddle-gpu==2.6.1.post120 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html +pip install paddlepaddle-gpu==2.6.1.post120 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html # or lower version: https://paddle-wheel.bj.bcebos.com/2.5.1/linux/linux-gpu-cuda11.7-cudnn8.4.1-mkl-gcc8.2-avx/paddlepaddle_gpu-2.5.1.post117-cp39-cp39-linux_x86_64.whl -python3 -m pip install -r requirements.txt +# downgrade pip +pip install --upgrade 'pip<24' + +# edit configuration file at `/helixfold/config/helixfold.yaml` to set your databases and binaries correctly + +# install HF3 as a python library +pip install . --no-cache-dir ``` Note: If you have a different version of python3 and cuda, please refer to [here](https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html) for the compatible PaddlePaddle `dev` package. @@ -125,58 +134,29 @@ sh run_infer.sh ``` The script is as follows, -```bash -#!/bin/bash - -PYTHON_BIN="PATH/TO/YOUR/PYTHON" -ENV_BIN="PATH/TO/YOUR/ENV" -MAXIT_SRC="PATH/TO/MAXIT/SRC" -DATA_DIR="PATH/TO/DATA" -export OBABEL_BIN="PATH/TO/OBABEL/BIN" -export PATH="$MAXIT_BIN/bin:$PATH" - -CUDA_VISIBLE_DEVICES=0 "$PYTHON_BIN" inference.py \ - --maxit_binary "$MAXIT_SRC/bin/maxit" \ - --jackhmmer_binary_path "$ENV_BIN/jackhmmer" \ - --hhblits_binary_path "$ENV_BIN/hhblits" \ - --hhsearch_binary_path "$ENV_BIN/hhsearch" \ - --kalign_binary_path "$ENV_BIN/kalign" \ - --hmmsearch_binary_path "$ENV_BIN/hmmsearch" \ - --hmmbuild_binary_path "$ENV_BIN/hmmbuild" \ - --nhmmer_binary_path "$ENV_BIN/nhmmer" \ - --preset='reduced_dbs' \ - --bfd_database_path "$DATA_DIR/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt" \ - --small_bfd_database_path "$DATA_DIR/small_bfd/bfd-first_non_consensus_sequences.fasta" \ - --bfd_database_path "$DATA_DIR/small_bfd/bfd-first_non_consensus_sequences.fasta" \ - --uniclust30_database_path "$DATA_DIR/uniclust30/uniclust30_2018_08/uniclust30_2018_08" \ - --uniprot_database_path "$DATA_DIR/uniprot/uniprot.fasta" \ - --pdb_seqres_database_path "$DATA_DIR/pdb_seqres/pdb_seqres.txt" \ - --uniref90_database_path "$DATA_DIR/uniref90/uniref90.fasta" \ - --mgnify_database_path "$DATA_DIR/mgnify/mgy_clusters_2018_12.fa" \ - --template_mmcif_dir "$DATA_DIR/pdb_mmcif/mmcif_files" \ - --obsolete_pdbs_path "$DATA_DIR/pdb_mmcif/obsolete.dat" \ - --ccd_preprocessed_path "$DATA_DIR/ccd_preprocessed_etkdg.pkl.gz" \ - --rfam_database_path "$DATA_DIR/Rfam-14.9_rep_seq.fasta" \ - --max_template_date=2020-05-14 \ - --input_json data/demo_protein_ligand.json \ - --output_dir ./output \ - --model_name allatom_demo \ - --init_model ./init_models/checkpoints.pdparams \ - --infer_times 3 \ - --precision "fp32" +##### Run from default config +```shell +LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH \ +helixfold \ + input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_8ecx.json \ + output=. CONFIG_DIFFS.preset=allatom_demo ``` + +##### Run with customized configuration dir and file(`./myfold.yaml`, for example): +```shell +LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH \ +helixfold --config-dir=. --config-name=myfold \ + input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_6zcy_smiles.json \ + output=. CONFIG_DIFFS.preset=allatom_demo +``` + The descriptions of the above script are as follows: -* Replace `MAXIT_SRC` with your installed `maxit`'s root path. -* Replace `DATA_DIR` with your downloaded data path. -* Replace `OBABEL_BIN` with your installed `openbabel` path. -* Replace `ENV_BIN` with your conda virtual environment or any environment where `hhblits`, `hmmsearch` and other dependencies have been installed. -* `--preset` - Set `'reduced_dbs'` to use small bfd or `'full_dbs'` to use full bfd. -* `--*_database_path` - Path to datasets you have downloaded. -* `--input_json` - Input data in the form of JSON. Input pattern in `./data/demo_*.json` for your reference. -* `--output_dir` - Model output path. The output will be in a folder named the same as your `--input_json` under this path. -* `--model_name` - Model name in `./helixfold/model/config.py`. Different model names specify different configurations. Mirro modification to configuration can be specified in `CONFIG_DIFFS` in the `config.py` without change to the full configuration in `CONFIG_ALLATOM`. -* `--infer_time` - The number of inferences executed by model for single input. In each inference, the model will infer `5` times (`diff_batch_size`) for the same input by default. This hyperparameter can be changed by `model.head.diffusion_module.test_diff_batch_size` within `./helixfold/model/config.py` -* `--precision` - Either `bf16` or `fp32`. Please check if your machine can support `bf16` or not beforing changing it. For example, `bf16` is supported by A100 and H100 or higher version while V100 only supports `fp32`. +* `LD_LIBRARY_PATH` - This is required to load the `libcudnn.so` library if you encounter issue like `RuntimeError: (PreconditionNotMet) Cannot load cudnn shared library. Cannot invoke method cudnnGetVersion.` +* `config-dir` - The directory that contains the alterative configuration file you would like to use. +* `config-name` - The name of the configuration file you would like to use. +* `input` - Input data in the form of JSON. Input pattern in `./data/demo_*.json` for your reference. +* `output` - Model output path. The output will be in a folder named the same as your `--input_json` under this path. +* `--CONFIG_DIFFS.preset` - Model name in `./helixfold/model/config.py`. Different model names specify different configurations. Mirro modification to configuration can be specified in `CONFIG_DIFFS` in the `config.py` without change to the full configuration in `CONFIG_ALLATOM`. ### Understanding Model Output diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml index 623a3561..262cc517 100644 --- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml +++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml @@ -12,6 +12,7 @@ amp_level: O1 # Corresponds to --amp_level infer_times: 1 # Corresponds to --infer_times diff_batch_size: -1 # Corresponds to --diff_batch_size use_small_bfd: false # Corresponds to --use_small_bfd +msa_only: false # Only process msa # File paths diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py index d26240cf..0ab811b9 100644 --- a/apps/protein_folding/helixfold3/helixfold/inference.py +++ b/apps/protein_folding/helixfold3/helixfold/inference.py @@ -39,12 +39,13 @@ from helixfold.data.tools import hmmsearch from helixfold.data import templates from helixfold.utils.utils import get_custom_amp_list -from helixfold.utils.model import RunModel from helixfold.utils.misc import set_logging_level from typing import Dict from helixfold.infer_scripts import feature_processing_aa, preprocess from helixfold.infer_scripts.tools import mmcif_writer +from apps.protein_folding.helixfold.alphafold_paddle.model.model import RunModel + script_path=os.path.dirname(__file__) ALLOWED_LIGAND_BONDS_TYPE_MAP = preprocess.ALLOWED_LIGAND_BONDS_TYPE_MAP diff --git a/apps/protein_folding/helixfold3/requirements.txt b/apps/protein_folding/helixfold3/requirements.txt deleted file mode 100644 index 660e43c1..00000000 --- a/apps/protein_folding/helixfold3/requirements.txt +++ /dev/null @@ -1,13 +0,0 @@ -absl-py==0.13.0 -biopython==1.79 -chex==0.0.7 -dm-haiku==0.0.4 -dm-tree==0.1.6 -docker==5.0.0 -immutabledict==2.0.0 -jax==0.2.14 -ml-collections==0.1.0 -pandas==1.3.4 -scipy==1.9.0 -rdkit-pypi==2022.9.5 -posebusters \ No newline at end of file diff --git a/apps/protein_folding/helixfold3/run_infer.sh b/apps/protein_folding/helixfold3/run_infer.sh deleted file mode 100644 index 5b0644e5..00000000 --- a/apps/protein_folding/helixfold3/run_infer.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -PYTHON_BIN="/usr/bin/python3" # changes to your python -ENV_BIN="/root/miniconda3/bin" # change to your env -MAXIT_SRC="PATH/TO/MAXIT/SRC" # changes to your MAXIT -export OBABEL_BIN="PATH/TO/OBABEL/BIN" # changes to your openbabel -DATA_DIR="./data" -export PATH="$MAXIT_SRC/bin:$PATH" - -CUDA_VISIBLE_DEVICES=0 "$PYTHON_BIN" inference.py \ - --maxit_binary "$MAXIT_SRC/bin/maxit" \ - --jackhmmer_binary_path "$ENV_BIN/jackhmmer" \ - --hhblits_binary_path "$ENV_BIN/hhblits" \ - --hhsearch_binary_path "$ENV_BIN/hhsearch" \ - --kalign_binary_path "$ENV_BIN/kalign" \ - --hmmsearch_binary_path "$ENV_BIN/hmmsearch" \ - --hmmbuild_binary_path "$ENV_BIN/hmmbuild" \ - --nhmmer_binary_path "$ENV_BIN/nhmmer" \ - --preset='reduced_dbs' \ - --bfd_database_path "$DATA_DIR/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt" \ - --small_bfd_database_path "$DATA_DIR/small_bfd/bfd-first_non_consensus_sequences.fasta" \ - --bfd_database_path "$DATA_DIR/small_bfd/bfd-first_non_consensus_sequences.fasta" \ - --uniclust30_database_path "$DATA_DIR/uniclust30/uniclust30_2018_08/uniclust30_2018_08" \ - --uniprot_database_path "$DATA_DIR/uniprot/uniprot.fasta" \ - --pdb_seqres_database_path "$DATA_DIR/pdb_seqres/pdb_seqres.txt" \ - --uniref90_database_path "$DATA_DIR/uniref90/uniref90.fasta" \ - --mgnify_database_path "$DATA_DIR/mgnify/mgy_clusters_2018_12.fa" \ - --template_mmcif_dir "$DATA_DIR/pdb_mmcif/mmcif_files" \ - --obsolete_pdbs_path "$DATA_DIR/pdb_mmcif/obsolete.dat" \ - --ccd_preprocessed_path "$DATA_DIR/ccd_preprocessed_etkdg.pkl.gz" \ - --rfam_database_path "$DATA_DIR/Rfam-14.9_rep_seq.fasta" \ - --max_template_date=2020-05-14 \ - --input_json data/demo_6zcy.json \ - --output_dir ./output \ - --model_name allatom_demo \ - --init_model init_models/HelixFold3-240814.pdparams \ - --infer_times 1 \ - --diff_batch_size 1 \ - --precision "fp32" \ No newline at end of file diff --git a/apps/protein_folding/helixfold3/setup_env.sh b/apps/protein_folding/helixfold3/setup_env.sh deleted file mode 100644 index 30f008d6..00000000 --- a/apps/protein_folding/helixfold3/setup_env.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -ENV_NAME='helixfold' -CUDA=12.0 - -# follow https://developer.nvidia.com/cuda-downloads to install cuda and cudatoolkit - -# Install py env -conda create -n ${ENV_NAME} -y -c conda-forge pip python=3.9; -source activate ${ENV_NAME} -conda install -y cudnn=8.4.1 cudatoolkit=11.7 nccl=2.14.3 -c conda-forge -c nvidia - -conda install -y -c bioconda hmmer==3.3.2 kalign2==2.04 hhsuite==3.3.0 -conda install -y -c conda-forge openbabel - -python -m pip install --upgrade 'pip<24';pip install . --no-cache-dir - -pip install https://paddle-wheel.bj.bcebos.com/2.5.1/linux/linux-gpu-cuda11.7-cudnn8.4.1-mkl-gcc8.2-avx/paddlepaddle_gpu-2.5.1.post117-cp39-cp39-linux_x86_64.whl From 84a550cb770b1ca827d1f263dd605282d42e534d Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Sat, 17 Aug 2024 22:01:36 +0800 Subject: [PATCH 14/42] fix: imports --- apps/protein_folding/helixfold3/README.md | 3 +++ .../helixfold3/helixfold/config/helixfold.yaml | 14 +++++++------- .../helixfold3/helixfold/inference.py | 2 +- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/apps/protein_folding/helixfold3/README.md b/apps/protein_folding/helixfold3/README.md index bdf4264e..c0bed52c 100644 --- a/apps/protein_folding/helixfold3/README.md +++ b/apps/protein_folding/helixfold3/README.md @@ -150,6 +150,8 @@ helixfold --config-dir=. --config-name=myfold \ output=. CONFIG_DIFFS.preset=allatom_demo ``` + + The descriptions of the above script are as follows: * `LD_LIBRARY_PATH` - This is required to load the `libcudnn.so` library if you encounter issue like `RuntimeError: (PreconditionNotMet) Cannot load cudnn shared library. Cannot invoke method cudnnGetVersion.` * `config-dir` - The directory that contains the alterative configuration file you would like to use. @@ -158,6 +160,7 @@ The descriptions of the above script are as follows: * `output` - Model output path. The output will be in a folder named the same as your `--input_json` under this path. * `--CONFIG_DIFFS.preset` - Model name in `./helixfold/model/config.py`. Different model names specify different configurations. Mirro modification to configuration can be specified in `CONFIG_DIFFS` in the `config.py` without change to the full configuration in `CONFIG_ALLATOM`. + ### Understanding Model Output The outputs will be in a subfolder of `output_dir`, including the computed MSAs, predicted structures, diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml index 262cc517..7de2fd7a 100644 --- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml +++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml @@ -62,10 +62,10 @@ other: # CONFIG_DIFFS for advanced configuration CONFIG_DIFFS: preset: null #choices=['null','allatom_demo', 'allatom_subbatch_64_recycle_1'] - model: - global_config: - subbatch_size: 96 # model.global_config.subbatch_size - num_recycle: 3 # model.num_recycle - heads: - confidence_head: - weight: 0.0 # model.heads.confidence_head.weight + # model: + # global_config: + # subbatch_size: 96 # model.global_config.subbatch_size + # num_recycle: 3 # model.num_recycle + # heads: + # confidence_head: + # weight: 0.0 # model.heads.confidence_head.weight diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py index 0ab811b9..0213b725 100644 --- a/apps/protein_folding/helixfold3/helixfold/inference.py +++ b/apps/protein_folding/helixfold3/helixfold/inference.py @@ -36,6 +36,7 @@ from helixfold.data import pipeline_rna_parallel as pipeline_rna from helixfold.data import pipeline_rna_multimer from helixfold.data.utils import atom_level_keys, map_to_continuous_indices +from helixfold.utils.model import RunModel from helixfold.data.tools import hmmsearch from helixfold.data import templates from helixfold.utils.utils import get_custom_amp_list @@ -44,7 +45,6 @@ from helixfold.infer_scripts import feature_processing_aa, preprocess from helixfold.infer_scripts.tools import mmcif_writer -from apps.protein_folding.helixfold.alphafold_paddle.model.model import RunModel script_path=os.path.dirname(__file__) From 21d41088f0d10ab7c1f1da5a658a2ff3b9a44dd0 Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Sat, 17 Aug 2024 22:40:49 +0800 Subject: [PATCH 15/42] fix: config overriden --- apps/protein_folding/helixfold3/README.md | 14 +++++++-- .../helixfold3/helixfold/inference.py | 1 + .../helixfold3/helixfold/model/config.py | 30 +++++++++---------- 3 files changed, 26 insertions(+), 19 deletions(-) diff --git a/apps/protein_folding/helixfold3/README.md b/apps/protein_folding/helixfold3/README.md index c0bed52c..7c1ba474 100644 --- a/apps/protein_folding/helixfold3/README.md +++ b/apps/protein_folding/helixfold3/README.md @@ -150,7 +150,15 @@ helixfold --config-dir=. --config-name=myfold \ output=. CONFIG_DIFFS.preset=allatom_demo ``` - +##### Run with additional configuration term +```shell +LD_LIBRARY_PATH=/mnt/data/envs/conda_env/envs/helixfold/lib/:$LD_LIBRARY_PATH \ +helixfold \ + input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_6zcy.json \ + output=. \ + CONFIG_DIFFS.model.heads.confidence_head.weight=0.01 \ + CONFIG_DIFFS.model.global_config.subbatch_size=192 +``` The descriptions of the above script are as follows: * `LD_LIBRARY_PATH` - This is required to load the `libcudnn.so` library if you encounter issue like `RuntimeError: (PreconditionNotMet) Cannot load cudnn shared library. Cannot invoke method cudnnGetVersion.` @@ -158,8 +166,8 @@ The descriptions of the above script are as follows: * `config-name` - The name of the configuration file you would like to use. * `input` - Input data in the form of JSON. Input pattern in `./data/demo_*.json` for your reference. * `output` - Model output path. The output will be in a folder named the same as your `--input_json` under this path. -* `--CONFIG_DIFFS.preset` - Model name in `./helixfold/model/config.py`. Different model names specify different configurations. Mirro modification to configuration can be specified in `CONFIG_DIFFS` in the `config.py` without change to the full configuration in `CONFIG_ALLATOM`. - +* `CONFIG_DIFFS.preset` - Model name in `./helixfold/model/config.py`. Different model names specify different configurations. Mirro modification to configuration can be specified in `CONFIG_DIFFS` in the `config.py` without change to the full configuration in `CONFIG_ALLATOM`. +* `CONFIG_DIFFS.*` - Override model any configuration in `CONFIG_ALLATOM`. ### Understanding Model Output diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py index 0213b725..a326bbd0 100644 --- a/apps/protein_folding/helixfold3/helixfold/inference.py +++ b/apps/protein_folding/helixfold3/helixfold/inference.py @@ -491,6 +491,7 @@ def main(cfg: DictConfig): ### Create model model_config = config.model_config(cfg.CONFIG_DIFFS) + logging.warning(f'>>> Model config: \n{model_config}\n\n') model = RunModel(model_config) diff --git a/apps/protein_folding/helixfold3/helixfold/model/config.py b/apps/protein_folding/helixfold3/helixfold/model/config.py index c4682b29..f6d2fe97 100644 --- a/apps/protein_folding/helixfold3/helixfold/model/config.py +++ b/apps/protein_folding/helixfold3/helixfold/model/config.py @@ -16,7 +16,6 @@ import copy from typing import Any, Union -import ml_collections from omegaconf import DictConfig @@ -26,7 +25,7 @@ NUM_TEMPLATES = 'num templates placeholder' -def model_config(config_diffs: Union[str, DictConfig, dict[str, dict[str, Any]]]) -> ml_collections.ConfigDict: +def model_config(config_diffs: Union[str, DictConfig, dict[str, dict[str, Any]]]) -> DictConfig: """Get the ConfigDict of a model.""" cfg = copy.deepcopy(CONFIG_ALLATOM) @@ -37,34 +36,33 @@ def model_config(config_diffs: Union[str, DictConfig, dict[str, dict[str, Any]]] if isinstance(config_diffs, DictConfig): if 'preset' in config_diffs and (preset_name:=config_diffs['preset']) in CONFIG_DIFFS: updated_config=CONFIG_DIFFS[preset_name] - cfg.update_from_flattened_dict(updated_config) + cfg.merge_with_dotlist(updated_config) print(f'Updated config from `CONFIG_DIFFS.{preset_name}`: {updated_config}') return cfg # update from detailed configuration if any(root_kw in config_diffs for root_kw in CONFIG_ALLATOM): - for root_kw in CONFIG_ALLATOM: - if root_kw in config_diffs and (updated_config := config_diffs[root_kw]) is not None: - cfg.update(dict(updated_config)) - print(f'Updated config from `CONFIG_DIFFS`: {updated_config}') + + cfg.merge_with(config_diffs) # merge to override + print(f'Updated config from `CONFIG_DIFFS`: {config_diffs}') return cfg raise ValueError(f'Invalid config_diffs ({type(config_diffs)}): {config_diffs}') # preset for runs -CONFIG_DIFFS = { - 'allatom_demo': { - 'model.heads.confidence_head.weight': 0.01 - }, - 'allatom_subbatch_64_recycle_1': { - 'model.global_config.subbatch_size': 64, - 'model.num_recycle': 1, - }, +CONFIG_DIFFS: dict[str, list[str]] = { + 'allatom_demo': [ + 'model.heads.confidence_head.weight=0.01' + ], + 'allatom_subbatch_64_recycle_1': [ + 'model.global_config.subbatch_size=64', + 'model.num_recycle=1', + ] } -CONFIG_ALLATOM = ml_collections.ConfigDict({ +CONFIG_ALLATOM = DictConfig({ 'data': { 'num_blocks': 5, # for msa block deletion 'randomize_num_blocks': True, From 75616744852b63312eff2446b3232fb600aafee5 Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Sat, 17 Aug 2024 23:06:06 +0800 Subject: [PATCH 16/42] doc: update for cli --- apps/protein_folding/helixfold3/README.md | 25 +++++++++++-------- .../helixfold/config/helixfold.yaml | 2 ++ 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/apps/protein_folding/helixfold3/README.md b/apps/protein_folding/helixfold3/README.md index 7c1ba474..cc1a3109 100644 --- a/apps/protein_folding/helixfold3/README.md +++ b/apps/protein_folding/helixfold3/README.md @@ -36,7 +36,7 @@ Those settings are recommended as they are the same as we used in our A100 machi ### Installation HelixFold3 depends on [PaddlePaddle](https://github.com/paddlepaddle/paddle). Python dependencies available through `pip` -is provided in `requirements.txt`. `kalign`, the [`HH-suite`](https://github.com/soedinglab/hh-suite) and `jackhmmer` are +is provided with `pyproject.toml`. `kalign`, the [`HH-suite`](https://github.com/soedinglab/hh-suite) and `jackhmmer` are also needed to produce multiple sequence alignments. The download scripts require `aria2c`. Locate to the directory of `helixfold` then run: @@ -50,7 +50,7 @@ conda activate helixfold # adjust these version numbers as your situation conda install -y cudnn=8.4.1 cudatoolkit=11.7 nccl=2.14.3 -c conda-forge -c nvidia -conda install -y -c bioconda hmmer==3.3.2 kalign2==2.04 hhsuite==3.3.0 +conda install -y -c bioconda aria2 hmmer==3.3.2 kalign2==2.04 hhsuite==3.3.0 conda install -y -c conda-forge openbabel # install paddlepaddle @@ -60,7 +60,7 @@ pip install paddlepaddle-gpu==2.6.1.post120 -f https://www.paddlepaddle.org.cn/w # downgrade pip pip install --upgrade 'pip<24' -# edit configuration file at `/helixfold/config/helixfold.yaml` to set your databases and binaries correctly +# edit configuration file at `./helixfold/config/helixfold.yaml` to set your databases and binaries correctly. # install HF3 as a python library pip install . --no-cache-dir @@ -138,26 +138,29 @@ The script is as follows, ```shell LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH \ helixfold \ - input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_8ecx.json \ - output=. CONFIG_DIFFS.preset=allatom_demo + input=./data/demo_8ecx.json \ + output=. \ + CONFIG_DIFFS.preset=allatom_demo ``` ##### Run with customized configuration dir and file(`./myfold.yaml`, for example): ```shell LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH \ helixfold --config-dir=. --config-name=myfold \ - input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_6zcy_smiles.json \ - output=. CONFIG_DIFFS.preset=allatom_demo + input=./data/demo_6zcy_smiles.json \ + output=. \ + CONFIG_DIFFS.preset=allatom_demo ``` ##### Run with additional configuration term ```shell -LD_LIBRARY_PATH=/mnt/data/envs/conda_env/envs/helixfold/lib/:$LD_LIBRARY_PATH \ +LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH \ helixfold \ - input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_6zcy.json \ + input=./data/demo_6zcy.json \ output=. \ CONFIG_DIFFS.model.heads.confidence_head.weight=0.01 \ - CONFIG_DIFFS.model.global_config.subbatch_size=192 + CONFIG_DIFFS.model.global_config.subbatch_size=192 \ + CONFIG_DIFFS.model.num_recycle=10 ``` The descriptions of the above script are as follows: @@ -166,7 +169,7 @@ The descriptions of the above script are as follows: * `config-name` - The name of the configuration file you would like to use. * `input` - Input data in the form of JSON. Input pattern in `./data/demo_*.json` for your reference. * `output` - Model output path. The output will be in a folder named the same as your `--input_json` under this path. -* `CONFIG_DIFFS.preset` - Model name in `./helixfold/model/config.py`. Different model names specify different configurations. Mirro modification to configuration can be specified in `CONFIG_DIFFS` in the `config.py` without change to the full configuration in `CONFIG_ALLATOM`. +* `CONFIG_DIFFS.preset` - Adjusted model config preset name in `./helixfold/model/config.py:CONFIG_DIFFS`. The preset will be updated into final model configuration with `CONFIG_ALLATOM`. * `CONFIG_DIFFS.*` - Override model any configuration in `CONFIG_ALLATOM`. ### Understanding Model Output diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml index 7de2fd7a..0015b4d5 100644 --- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml +++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml @@ -62,6 +62,8 @@ other: # CONFIG_DIFFS for advanced configuration CONFIG_DIFFS: preset: null #choices=['null','allatom_demo', 'allatom_subbatch_64_recycle_1'] + + # Detailed configuration adjustments against `CONFIG_ALLATOM` can be used here. for example: # model: # global_config: # subbatch_size: 96 # model.global_config.subbatch_size From e8582de19e05b467c366b5e3ef6438c3dd4a3da2 Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Sat, 17 Aug 2024 23:17:17 +0800 Subject: [PATCH 17/42] doc: update for cli --- apps/protein_folding/helixfold3/README.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/apps/protein_folding/helixfold3/README.md b/apps/protein_folding/helixfold3/README.md index cc1a3109..c8fc9c8c 100644 --- a/apps/protein_folding/helixfold3/README.md +++ b/apps/protein_folding/helixfold3/README.md @@ -128,12 +128,7 @@ A example of input data is as follows: #### Running HelixFold for Inference To run inference on a sequence or multiple sequences using HelixFold3's pretrained parameters, run e.g.: -* Inference on single GPU (change the settings in script BEFORE you run it) -``` -sh run_infer.sh -``` -The script is as follows, ##### Run from default config ```shell LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH \ From cd924293fd7782da0bf70459b3bdfebfed6761c5 Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Sun, 18 Aug 2024 13:24:37 +0800 Subject: [PATCH 18/42] fix: msa parallel for bfd --- .../helixfold/common/all_atom_pdb_save.py | 4 +- .../helixfold/config/helixfold.yaml | 6 +- .../helixfold/data/pipeline_parallel.py | 161 +++++++++--------- .../helixfold3/helixfold/inference.py | 4 +- .../protein_folding/helixfold3/pyproject.toml | 1 + 5 files changed, 89 insertions(+), 87 deletions(-) diff --git a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py index abc33ef6..50b2500a 100644 --- a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py +++ b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py @@ -182,7 +182,7 @@ def prediction_to_mmcif(pred_atom_pos: Union[np.ndarray, paddle.Tensor], '-output', mmcif_path, ] - print('Launching subprocess "%s"', ' '.join(cmd)) + print(f'Launching subprocess "{" " .join(cmd)}"', ) if os.path.exists('maxit.log'): os.remove('maxit.log') @@ -197,7 +197,7 @@ def prediction_to_mmcif(pred_atom_pos: Union[np.ndarray, paddle.Tensor], if retcode: - # Logs have a 15k character limit, so log HHblits error line by line. + # Logs have a 15k character limit, so log Maxit error line by line. print('Maxit failed. Maxit stderr begin:') raise RuntimeError(f'Maxit failed\nstdout:\n{stdout.decode("utf-8")}\n\n' f'stderr:\n{stderr[:500_000].decode("utf-8")}\n' diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml index 0015b4d5..5f51d2be 100644 --- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml +++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml @@ -14,6 +14,10 @@ diff_batch_size: -1 # Corresponds to --diff_batch_size use_small_bfd: false # Corresponds to --use_small_bfd msa_only: false # Only process msa +nproc_msa: + hhblits: 16 + jackhmmer: 8 + # File paths input: null # Corresponds to --input_json, required field @@ -52,7 +56,7 @@ template: # Preset configuration preset: - preset: reduced_dbs # Corresponds to --preset, choices=['reduced_dbs', 'full_dbs'] + preset: full_dbs # Corresponds to --preset, choices=['reduced_dbs', 'full_dbs'] # Other configurations other: diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py index 3ca52693..a1467d2d 100644 --- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py +++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py @@ -15,7 +15,7 @@ """Functions for building the input features for the HelixFold model.""" import os -from typing import Any, Mapping, MutableMapping, Optional, Sequence, Union +from typing import Any, Mapping, MutableMapping, Optional, Protocol, Sequence, Tuple, Union from absl import logging from helixfold.common import residue_constants from helixfold.data import msa_identifiers @@ -26,12 +26,16 @@ from helixfold.data.tools import hmmsearch from helixfold.data.tools import jackhmmer import numpy as np -from concurrent.futures import ProcessPoolExecutor, as_completed +from joblib import Parallel, delayed # Internal import (7716). FeatureDict = MutableMapping[str, np.ndarray] TemplateSearcher = Union[hhsearch.HHSearch, hmmsearch.Hmmsearch] +class MsaRunner(Protocol): + def query(self, input_fasta_path: str) -> Sequence[Mapping[str, Any]]: + """Runs the MSA tool on the input fasta file.""" + ... def make_sequence_features( sequence: str, description: str, num_res: int) -> FeatureDict: @@ -83,42 +87,41 @@ def make_msa_features(msas: Sequence[parsers.Msa]) -> FeatureDict: features['msa_species_identifiers'] = np.array(species_ids, dtype=np.object_) return features - -def run_msa_tool(msa_runner, input_fasta_path: str, msa_out_path: str, - msa_format: str, use_precomputed_msas: bool, - max_sto_sequences: Optional[int] = None - ) -> Mapping[str, Any]: - """Runs an MSA tool, checking if output already exists first.""" - if not use_precomputed_msas or not os.path.exists(msa_out_path): - if msa_format == 'sto' and max_sto_sequences is not None: - print('pipeline:',input_fasta_path,max_sto_sequences) - result = msa_runner.query(input_fasta_path, max_sto_sequences)[0] # pytype: disable=wrong-arg-count +# Yinying edited here to change various position args as one tuple for multiprocess tests +def run_msa_tool(args: Tuple[MsaRunner, str, str, str, bool, int]) -> Mapping[str, Any]: + if args == None: + return None + if (len_args:=len(args))!=6: + raise ValueError(f'MsaRunner must have exactly 6 arguments but got {len_args}') + + (msa_runner, input_fasta_path, msa_out_path, + msa_format, use_precomputed_msas, + max_sto_sequences) = args + #print(args) + """Runs an MSA tool, checking if output already exists first.""" + if not use_precomputed_msas or not os.path.exists(msa_out_path): + if msa_format == 'sto' and max_sto_sequences > 0: + result = msa_runner.query(input_fasta_path, max_sto_sequences)[0] # pytype: disable=wrong-arg-count + else: + result = msa_runner.query(input_fasta_path)[0] + with open(msa_out_path, 'w') as f: + f.write(result[msa_format]) else: - result = msa_runner.query(input_fasta_path)[0] - with open(msa_out_path, 'w') as f: - f.write(result[msa_format]) - else: + result=read_msa_result(msa_out_path,msa_format,max_sto_sequences) + return result + +def read_msa_result(msa_out_path,msa_format,max_sto_sequences): logging.warning('Reading MSA from file %s', msa_out_path) - if msa_format == 'sto' and max_sto_sequences is not None: - precomputed_msa = parsers.truncate_stockholm_msa( - msa_out_path, max_sto_sequences) - result = {'sto': precomputed_msa} + if msa_format == 'sto' and max_sto_sequences > 0: + precomputed_msa = parsers.truncate_stockholm_msa( + msa_out_path, max_sto_sequences) + result = {'sto': precomputed_msa} else: - with open(msa_out_path, 'r') as f: - result = {msa_format: f.read()} - return result - -def run_msa_tool_wrapper(args): - """ - 用于包装run_msa_tool函数的帮助程序,以便在使用argparse时可以更轻松地传递参数。 - - Args: - args (tuple, list): 一个元组或列表,其中包含要传递给run_msa_tool函数的参数。 - - Returns: - int: 返回run_msa_tool函数的返回值。 - """ - return run_msa_tool(*args) + with open(msa_out_path, 'r') as f: + result = {msa_format: f.read()} + return result + + class DataPipeline: @@ -138,29 +141,38 @@ def __init__(self, use_small_bfd: bool, mgnify_max_hits: int = 501, uniref_max_hits: int = 10000, - use_precomputed_msas: bool = False): + use_precomputed_msas: bool = False, + nprocs: Mapping[str, int] = { + 'hhblits': 16, + 'jackhmmer': 8, + }): """Initializes the data pipeline. Constructs a feature dict for a given FASTA file.""" self._use_small_bfd = use_small_bfd + self.nprocs=nprocs self.jackhmmer_uniref90_runner = jackhmmer.Jackhmmer( binary_path=jackhmmer_binary_path, - database_path=uniref90_database_path) + database_path=uniref90_database_path, n_cpu=self.nprocs.get('jackhmmer', 8)) if use_small_bfd: - self.jackhmmer_small_bfd_runner = jackhmmer.Jackhmmer( + self.bfd_runner = jackhmmer.Jackhmmer( binary_path=jackhmmer_binary_path, - database_path=small_bfd_database_path) + database_path=small_bfd_database_path, n_cpu=self.nprocs.get('jackhmmer', 8)) else: - self.hhblits_bfd_uniclust_runner = hhblits.HHBlits( + self.bfd_runner = hhblits.HHBlits( binary_path=hhblits_binary_path, - databases=[bfd_database_path, uniclust30_database_path]) + databases=[bfd_database_path, uniclust30_database_path], n_cpu=self.nprocs.get('hhblits', 8)) self.jackhmmer_mgnify_runner = jackhmmer.Jackhmmer( binary_path=jackhmmer_binary_path, - database_path=mgnify_database_path) + database_path=mgnify_database_path, n_cpu=self.nprocs.get('jackhmmer', 8)) self.template_searcher = template_searcher self.template_featurizer = template_featurizer self.mgnify_max_hits = mgnify_max_hits self.uniref_max_hits = uniref_max_hits self.use_precomputed_msas = use_precomputed_msas + def parallel_msa_joblib(self, func, input_args: list): + return Parallel(len(input_args))(delayed(func)(args) for args in input_args) + + def process(self, input_fasta_path: str, msa_output_dir: str) -> FeatureDict: """Runs alignment tools on the input sequence and creates features.""" with open(input_fasta_path) as f: @@ -202,44 +214,27 @@ def process(self, input_fasta_path: str, msa_output_dir: str) -> FeatureDict: input_fasta_path, os.path.join(msa_output_dir, 'mgnify_hits.sto'), 'sto', - self.use_precomputed_msas)) + self.use_precomputed_msas, + self.mgnify_max_hits)) - if self._use_small_bfd: - msa_tasks.append(( - self.jackhmmer_small_bfd_runner, - input_fasta_path, - os.path.join(msa_output_dir, 'small_bfd_hits.sto'), - 'sto', - self.use_precomputed_msas)) - else: - msa_tasks.append(( - self.hhblits_bfd_uniclust_runner, - input_fasta_path, - os.path.join(msa_output_dir, 'bfd_uniclust_hits.a3m'), - 'a3m', - self.use_precomputed_msas)) - - msa_results = {} - with ProcessPoolExecutor() as executor: - futures = {executor.submit(run_msa_tool_wrapper, msa_task): msa_task for msa_task in msa_tasks} - - for future in as_completed(futures): - task = futures[future] - try: - result = future.result() - if 'uniref90_hits.sto' in task[2]: - msa_results['uniref90'] = result - elif 'mgnify_hits.sto' in task[2]: - msa_results['mgnify'] = result - elif 'small_bfd_hits.sto' in task[2]: - msa_results['small_bfd'] = result - elif 'bfd_uniclust_hits.a3m' in task[2]: - msa_results['bfd_uniclust'] = result - - except Exception as exc: - print(f'Task {task} generated an exception : {exc}') - - msa_for_templates = msa_results['uniref90']['sto'] + + msa_tasks.append(( + self.bfd_runner, + input_fasta_path, + os.path.join(msa_output_dir, 'small_bfd_hits.sto' if self._use_small_bfd else 'bfd_uniclust_hits.a3m'), + 'sto' if self._use_small_bfd else 'a3m', + self.use_precomputed_msas, + 0)) + + + [ + jackhmmer_uniref90_result, + jackhmmer_mgnify_result, + bfd_result, + ] = self.parallel_msa_joblib(func=run_msa_tool, + input_args=msa_tasks) + + msa_for_templates = jackhmmer_uniref90_result['sto'] msa_for_templates = parsers.deduplicate_stockholm_msa(msa_for_templates) msa_for_templates = parsers.remove_empty_columns_from_stockholm_msa(msa_for_templates) @@ -257,16 +252,16 @@ def process(self, input_fasta_path: str, msa_output_dir: str) -> FeatureDict: with open(pdb_hits_out_path, 'w') as f: f.write(pdb_templates_result) - uniref90_msa = parsers.parse_stockholm(msa_results['uniref90']['sto']) - mgnify_msa = parsers.parse_stockholm(msa_results['mgnify']['sto']) + uniref90_msa = parsers.parse_stockholm(jackhmmer_uniref90_result['sto']) + mgnify_msa = parsers.parse_stockholm(jackhmmer_mgnify_result['sto']) pdb_template_hits = self.template_searcher.get_template_hits( output_string=pdb_templates_result, input_sequence=input_sequence) if self._use_small_bfd: - bfd_msa = parsers.parse_stockholm(msa_results['small_bfd']['sto']) + bfd_msa = parsers.parse_stockholm(bfd_result['sto']) else: - raise ValueError("Doesn't support full BFD yet.") + bfd_msa = parsers.parse_a3m(bfd_result['a3m']) templates_result = self.template_featurizer.get_templates( query_sequence=input_sequence, diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py index a326bbd0..3e028d1d 100644 --- a/apps/protein_folding/helixfold3/helixfold/inference.py +++ b/apps/protein_folding/helixfold3/helixfold/inference.py @@ -152,7 +152,9 @@ def get_msa_templates_pipeline(cfg: DictConfig) -> Dict: template_searcher=template_searcher, template_featurizer=template_featurizer, use_small_bfd=cfg.use_small_bfd, - use_precomputed_msas=use_precomputed_msas) + use_precomputed_msas=use_precomputed_msas, + nprocs=cfg.nproc_msa, + ) prot_data_pipeline = pipeline_multimer.DataPipeline( monomer_data_pipeline=monomer_data_pipeline, diff --git a/apps/protein_folding/helixfold3/pyproject.toml b/apps/protein_folding/helixfold3/pyproject.toml index bc988ca9..8c6dc9f7 100644 --- a/apps/protein_folding/helixfold3/pyproject.toml +++ b/apps/protein_folding/helixfold3/pyproject.toml @@ -41,6 +41,7 @@ rdkit-pypi = "2022.9.5" posebusters = "*" hydra-core= "^1.3.2" omegaconf = "^2.3.0" +joblib = "1.4.2" From 8c22e155cbb768e4434b20bc82bc2920391ffdc3 Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Sun, 18 Aug 2024 14:02:33 +0800 Subject: [PATCH 19/42] fix: multimer msa parallel --- .../helixfold/config/helixfold.yaml | 4 +-- .../data/pipeline_multimer_parallel.py | 33 ++++++++++++++----- .../helixfold/data/pipeline_parallel.py | 17 ++++++++-- 3 files changed, 40 insertions(+), 14 deletions(-) diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml index 5f51d2be..bc163019 100644 --- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml +++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml @@ -15,8 +15,8 @@ use_small_bfd: false # Corresponds to --use_small_bfd msa_only: false # Only process msa nproc_msa: - hhblits: 16 - jackhmmer: 8 + hhblits: 16 # Number of processors used by hhblits + jackhmmer: 8 # Number of processors used by jackhmmer # File paths diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_multimer_parallel.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_multimer_parallel.py index 9d595a07..251503aa 100644 --- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_multimer_parallel.py +++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_multimer_parallel.py @@ -14,7 +14,8 @@ from helixfold.data import feature_processing from helixfold.data import msa_pairing from helixfold.data import parsers -from helixfold.data import pipeline +#from helixfold.data import pipeline +from helixfold.data import pipeline_parallel as pipeline from helixfold.data.tools import jackhmmer import numpy as np import multiprocessing @@ -197,24 +198,38 @@ def _process_single_chain( with temp_fasta_file(chain_fasta_str) as chain_fasta_path: logging.info('Running monomer pipeline on chain %s: %s', chain_id, description) + + # We only construct the pairing features if there are 2 or more unique + # sequences. + self.jackhmmer_uniprot_args=( + self._uniprot_msa_runner, + str(chain_fasta_path), + os.path.join(chain_msa_output_dir, 'uniprot_hits.sto'), + 'sto', + self.use_precomputed_msas, + 0 + ) + chain_features = self._monomer_data_pipeline.process( input_fasta_path=chain_fasta_path, - msa_output_dir=chain_msa_output_dir) + msa_output_dir=chain_msa_output_dir, + other_args=self.jackhmmer_uniprot_args if not is_homomer_or_monomer else None) # We only construct the pairing features if there are 2 or more unique # sequences. if not is_homomer_or_monomer: - all_seq_msa_features = self._all_seq_msa_features(chain_fasta_path, - chain_msa_output_dir) + all_seq_msa_features = self._all_seq_msa_features(chain_msa_output_dir) chain_features.update(all_seq_msa_features) return chain_features - def _all_seq_msa_features(self, input_fasta_path, msa_output_dir): + def _all_seq_msa_features(self, msa_output_dir): """Get MSA features for unclustered uniprot, for pairing.""" - out_path = os.path.join(msa_output_dir, 'uniprot_hits.sto') - result = pipeline.run_msa_tool( - self._uniprot_msa_runner, input_fasta_path, out_path, 'sto', - self.use_precomputed_msas) + # edited by yinying to adapt to the multiprocess version of run_msa_tool function + result = pipeline.read_msa_result( + msa_out_path=os.path.join(msa_output_dir, 'uniprot_hits.sto'), + msa_format='sto', + max_sto_sequences=0 + ) msa = parsers.parse_stockholm(result['sto']) msa = msa.truncate(max_seqs=self._max_uniprot_hits) all_seq_features = pipeline.make_msa_features([msa]) diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py index a1467d2d..53bb0445 100644 --- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py +++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py @@ -33,10 +33,17 @@ TemplateSearcher = Union[hhsearch.HHSearch, hmmsearch.Hmmsearch] class MsaRunner(Protocol): + n_cpu: int + def query(self, input_fasta_path: str) -> Sequence[Mapping[str, Any]]: """Runs the MSA tool on the input fasta file.""" ... +def check_used_ncpus(used: list[int]): + ncpus_sum=sum(used) + if ncpus_sum >os.cpu_count(): + logging.warning(f"The number of used CPUs({ncpus_sum}) is larger than the number of available CPUs({os.cpu_count()}).") + def make_sequence_features( sequence: str, description: str, num_res: int) -> FeatureDict: """Constructs a feature dict of sequence features.""" @@ -170,10 +177,10 @@ def __init__(self, self.use_precomputed_msas = use_precomputed_msas def parallel_msa_joblib(self, func, input_args: list): - return Parallel(len(input_args))(delayed(func)(args) for args in input_args) + return Parallel(len(input_args),verbose=100)(delayed(func)(args) for args in input_args) - def process(self, input_fasta_path: str, msa_output_dir: str) -> FeatureDict: + def process(self, input_fasta_path: str, msa_output_dir: str,other_args: Optional[tuple] = None) -> FeatureDict: """Runs alignment tools on the input sequence and creates features.""" with open(input_fasta_path) as f: input_fasta_str = f.read() @@ -186,7 +193,7 @@ def process(self, input_fasta_path: str, msa_output_dir: str) -> FeatureDict: num_res = len(input_sequence) - msa_tasks = [] + msa_tasks: list[Tuple[MsaRunner, str, str, str, bool, int]] = [] """uniref90_out_path = os.path.join(msa_output_dir, 'uniref90_hits.sto') jackhmmer_uniref90_result = run_msa_tool( msa_runner=self.jackhmmer_uniref90_runner, @@ -225,12 +232,16 @@ def process(self, input_fasta_path: str, msa_output_dir: str) -> FeatureDict: 'sto' if self._use_small_bfd else 'a3m', self.use_precomputed_msas, 0)) + + msa_tasks.append(other_args) + check_used_ncpus(used=[mask[0].n_cpu for mask in msa_tasks if hasattr(mask[0], 'n_cpu')]) [ jackhmmer_uniref90_result, jackhmmer_mgnify_result, bfd_result, + ] = self.parallel_msa_joblib(func=run_msa_tool, input_args=msa_tasks) From 8ce13ca2262983f93af0f89d7e3d2bd39d50dbb5 Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Sun, 18 Aug 2024 14:07:54 +0800 Subject: [PATCH 20/42] fix: multimer msa parallel --- .../helixfold3/helixfold/data/pipeline_parallel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py index 53bb0445..ade48cb5 100644 --- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py +++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py @@ -241,6 +241,7 @@ def process(self, input_fasta_path: str, msa_output_dir: str,other_args: Optiona jackhmmer_uniref90_result, jackhmmer_mgnify_result, bfd_result, + other_result ] = self.parallel_msa_joblib(func=run_msa_tool, input_args=msa_tasks) From 8548570c8470ca53b6cc438fef3ee0fd496bbd0f Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Sun, 18 Aug 2024 14:25:56 +0800 Subject: [PATCH 21/42] drop: buildin logger --- .../infer_scripts/feature_processing_aa.py | 12 +++--- .../helixfold3/helixfold/inference.py | 22 +++++------ .../helixfold3/helixfold/utils/misc.py | 39 ------------------- .../helixfold3/helixfold/utils/model.py | 4 +- 4 files changed, 19 insertions(+), 58 deletions(-) delete mode 100644 apps/protein_folding/helixfold3/helixfold/utils/misc.py diff --git a/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py index b89f43f4..3db7640c 100644 --- a/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py +++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py @@ -2,9 +2,10 @@ import collections import copy import os +from pathlib import Path import time, gzip, pickle import numpy as np -import logging +from absl import logging from helixfold.common import residue_constants from helixfold.data import parsers from helixfold.data import pipeline_multimer @@ -14,7 +15,6 @@ from concurrent.futures import ProcessPoolExecutor, as_completed from .preprocess import digit2alphabet -logger = logging.getLogger(__file__) POLYMER_STANDARD_RESI_ATOMS = residue_constants.residue_atoms STRING_FEATURES = ['all_chain_ids', 'all_ccd_ids','all_atom_ids', @@ -356,7 +356,7 @@ def process_chain_msa(args): data_pipeline, chain_id, seq, desc, \ msa_output_dir, features_pkl = args if features_pkl.exists(): - logger.info('Use cached features.pkl') + logging.info('Use cached features.pkl') with open(features_pkl, 'rb') as f: raw_features = pickle.load(f) else: @@ -450,7 +450,7 @@ def process_input_json(all_entitys, ccd_preprocessed_path, ## 2. multiprocessing for protein/rna MSA/Template search. seqs_to_msa_features = {} - logger.info('[Multiprocess] starting MSA/Template search...') + logging.info('[Multiprocess] starting MSA/Template search...') t0 = time.time() with ProcessPoolExecutor() as executor: futures = [executor.submit(process_chain_msa, task) for task in tasks] @@ -461,8 +461,8 @@ def process_input_json(all_entitys, ccd_preprocessed_path, seqs_to_msa_features[seqs] = raw_features except Exception as exc: import traceback; traceback.print_exc() - logger.error(f'Task generated an exception : {exc}') - logger.info(f'[Multiprocess] All msa/template use: {time.time() - t0}') + logging.error(f'Task generated an exception : {exc}') + logging.info(f'[Multiprocess] All msa/template use: {time.time() - t0}') ## 3. add msa_templ_feats to all_chain_features. for type_chain_id in all_chain_features.keys(): diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py index 3e028d1d..7734c964 100644 --- a/apps/protein_folding/helixfold3/helixfold/inference.py +++ b/apps/protein_folding/helixfold3/helixfold/inference.py @@ -26,6 +26,8 @@ import numpy as np import shutil +from absl import logging + from omegaconf import DictConfig import hydra @@ -40,7 +42,6 @@ from helixfold.data.tools import hmmsearch from helixfold.data import templates from helixfold.utils.utils import get_custom_amp_list -from helixfold.utils.misc import set_logging_level from typing import Dict from helixfold.infer_scripts import feature_processing_aa, preprocess from helixfold.infer_scripts.tools import mmcif_writer @@ -68,7 +69,6 @@ RETURN_KEYS = ['diffusion_module', 'confidence_head'] -logger = logging.getLogger(__file__) MAX_TEMPLATE_HITS = 4 @@ -95,7 +95,7 @@ def preprocess_json_entity(json_path, out_dir): if all_entitys is None: raise ValueError("The json file does not contain any valid entity.") else: - logger.info("The json file contains %d valid entity.", len(all_entitys)) + logging.info("The json file contains %d valid entity.", len(all_entitys)) return all_entitys @@ -187,7 +187,7 @@ def ranking_all_predictions(output_dirs): ranked_map = dict(sorted(ranking_score_path_map.items(), key=lambda x: x[1], reverse=True)) rank_id = 1 for outpath, rank_score in ranked_map.items(): - logger.debug("[ranking_all_predictions] Ranking score of %s: %.5f", outpath, rank_score) + logging.debug("[ranking_all_predictions] Ranking score of %s: %.5f", outpath, rank_score) basename_prefix = os.path.basename(outpath).split('-pred-')[0] target_path = os.path.join(os.path.dirname(outpath), f'{basename_prefix}-rank{rank_id}') if os.path.exists(target_path) and os.path.isdir(target_path): @@ -216,7 +216,7 @@ def _forward_with_precision(batch): raise ValueError("Please choose precision from bf16 and fp32! ") res = _forward_with_precision(batch) - logger.info(f"Inference Succeeds...\n") + logging.info(f"Inference Succeeds...\n") return res @@ -452,7 +452,7 @@ def split_prediction(pred, rank): @hydra.main(version_base=None, config_path=os.path.join(script_path,'config',),config_name='helixfold') def main(cfg: DictConfig): - set_logging_level(cfg.logging_level) + logging.set_verbosity(cfg.logging_level) """main function""" new_einsum = os.getenv("FLAGS_new_einsum", True) @@ -477,7 +477,7 @@ def main(cfg: DictConfig): if seed is None: seed = np.random.randint(10000000) else: - logger.warning('Seed is only used for reproduction') + logging.warning('Seed is only used for reproduction') init_seed(seed) use_small_bfd = cfg.preset.preset == 'reduced_dbs' @@ -488,7 +488,7 @@ def main(cfg: DictConfig): assert cfg.db.bfd is not None assert cfg.db.uniclust30 is not None - logger.info('Getting MSA/Template Pipelines...') + logging.info('Getting MSA/Template Pipelines...') msa_templ_data_pipeline_dict = get_msa_templates_pipeline(cfg=cfg) ### Create model @@ -541,12 +541,12 @@ def main(cfg: DictConfig): if cfg.diff_batch_size > 0: model_config.model.heads.diffusion_module.test_diff_batch_size = cfg.diff_batch_size diff_batch_size = model_config.model.heads.diffusion_module.test_diff_batch_size - logger.info(f'Inference {infer_times} Times...') - logger.info(f"Diffusion batch size {diff_batch_size}...\n") + logging.info(f'Inference {infer_times} Times...') + logging.info(f"Diffusion batch size {diff_batch_size}...\n") all_pred_path = [] for infer_id in range(infer_times): - logger.info(f'Start {infer_id}-th inference...\n') + logging.info(f'Start {infer_id}-th inference...\n') prediction = eval(cfg, model, feature_dict) # save result diff --git a/apps/protein_folding/helixfold3/helixfold/utils/misc.py b/apps/protein_folding/helixfold3/helixfold/utils/misc.py deleted file mode 100644 index b8faa065..00000000 --- a/apps/protein_folding/helixfold3/helixfold/utils/misc.py +++ /dev/null @@ -1,39 +0,0 @@ -# copyright (c) 2024 PaddleHelix Authors. All Rights Reserve. -# -# Licensed under Creative Commons Attribution-NonCommercial-ShareAlike 4.0 -# International License (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://creativecommons.org/licenses/by-nc-sa/4.0/ -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -misc utils -""" - -import logging - -def set_logging_level(level): - level_dict = { - "NOTSET": logging.NOTSET, - "DEBUG": logging.DEBUG, - "INFO": logging.INFO, - "WARNING": logging.WARNING, - "ERROR": logging.ERROR, - "CRITICAL": logging.CRITICAL - } - logging.basicConfig( - format='%(asctime)s %(levelname)s %(message)s', - level=level_dict[level], - datefmt='%Y-%m-%d %H:%M:%S') - for h in logging.root.handlers: - h.setFormatter( - logging.Formatter( - '%(asctime)s %(levelname)s %(message)s', - datefmt='%Y-%m-%d %H:%M:%S' - )) - logging.root.setLevel(level_dict[level]) diff --git a/apps/protein_folding/helixfold3/helixfold/utils/model.py b/apps/protein_folding/helixfold3/helixfold/utils/model.py index 67b36128..885510bd 100644 --- a/apps/protein_folding/helixfold3/helixfold/utils/model.py +++ b/apps/protein_folding/helixfold3/helixfold/utils/model.py @@ -22,7 +22,7 @@ from helixfold.model import modules_all_atom from helixfold.model import utils -logger = logging.getLogger(__name__) +from absl import logging class RunModel(nn.Layer): """ @@ -69,7 +69,7 @@ def init_params(self, params_path: str): utils.pd_params_merge_qkvw(pd_params) elif params_path.endswith('.pd') or params_path.endswith('.pdparams'): - logger.info('Load as Paddle model') + logging.info('Load as Paddle model') pd_params = paddle.load(params_path) else: From d2feac530ce12b066f67622ad489e8059dad6fdf Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Mon, 19 Aug 2024 11:08:11 +0800 Subject: [PATCH 22/42] fix: nested parallel runs for protein chains --- .../data/demo_7u7w_protein_nucleic.json | 19 +++++++++ .../data/demo_7u7w_protein_nucleic_sm.json | 24 ++++++++++++ .../helixfold/data/pipeline_conf_bonds.py | 6 +-- .../helixfold/data/pipeline_token_feature.py | 39 +++++++++++++------ .../infer_scripts/feature_processing_aa.py | 39 ++++++++----------- 5 files changed, 91 insertions(+), 36 deletions(-) create mode 100644 apps/protein_folding/helixfold3/data/demo_7u7w_protein_nucleic.json create mode 100644 apps/protein_folding/helixfold3/data/demo_7u7w_protein_nucleic_sm.json diff --git a/apps/protein_folding/helixfold3/data/demo_7u7w_protein_nucleic.json b/apps/protein_folding/helixfold3/data/demo_7u7w_protein_nucleic.json new file mode 100644 index 00000000..b846bea4 --- /dev/null +++ b/apps/protein_folding/helixfold3/data/demo_7u7w_protein_nucleic.json @@ -0,0 +1,19 @@ +{ + "entities": [ + { + "type": "protein", + "sequence": "GPHMATGQDRVVALVDMDCFFVQVEQRQNPHLRNKPCAVVQYKSWKGGGIIAVSYEARAFGVTRSMWADDAKKLCPDLLLAQVRESRGKANLTKYREASVEVMEIMSRFAVIERASIDEAYVDLTSAVQERLQKLQGQPISADLLPSTYIEGLPQGPTTAEETVQKEGMRKQGLFQWLDSLQIDNLTSPDLQLTVGAVIVEEMRAAIERETGFQCSAGISHNKVLAKLACGLNKPNRQTLVSHGSVPQLFSQMPIRKIRSLGGKLGASVIEILGIEYMGELTQFTESQLQSHFGEKNGSWLYAMCRGIEHDPVKPRQLPKTIGCSKNFPGKTALATREQVQWWLLQLAQELEERLTKDRNDNDRVATQLVVSIRVQGDKRLSSLRRCCALTRYDAHKMSHDAFTVIKNCNTSGIQTEWSPPLTMLFLCATKFSAS", + "count": 1 + }, + { + "type": "dna", + "sequence": "CATTATGACGCT", + "count": 1 + }, + { + "type": "dna", + "sequence": "AGCGTCAT", + "count": 1 + } + ] +} \ No newline at end of file diff --git a/apps/protein_folding/helixfold3/data/demo_7u7w_protein_nucleic_sm.json b/apps/protein_folding/helixfold3/data/demo_7u7w_protein_nucleic_sm.json new file mode 100644 index 00000000..77b4dbd1 --- /dev/null +++ b/apps/protein_folding/helixfold3/data/demo_7u7w_protein_nucleic_sm.json @@ -0,0 +1,24 @@ +{ + "entities": [ + { + "type": "protein", + "sequence": "GPHMATGQDRVVALVDMDCFFVQVEQRQNPHLRNKPCAVVQYKSWKGGGIIAVSYEARAFGVTRSMWADDAKKLCPDLLLAQVRESRGKANLTKYREASVEVMEIMSRFAVIERASIDEAYVDLTSAVQERLQKLQGQPISADLLPSTYIEGLPQGPTTAEETVQKEGMRKQGLFQWLDSLQIDNLTSPDLQLTVGAVIVEEMRAAIERETGFQCSAGISHNKVLAKLACGLNKPNRQTLVSHGSVPQLFSQMPIRKIRSLGGKLGASVIEILGIEYMGELTQFTESQLQSHFGEKNGSWLYAMCRGIEHDPVKPRQLPKTIGCSKNFPGKTALATREQVQWWLLQLAQELEERLTKDRNDNDRVATQLVVSIRVQGDKRLSSLRRCCALTRYDAHKMSHDAFTVIKNCNTSGIQTEWSPPLTMLFLCATKFSAS", + "count": 1 + }, + { + "type": "dna", + "sequence": "CATTATGACGCT", + "count": 1 + }, + { + "type": "dna", + "sequence": "AGCGTCAT", + "count": 1 + }, + { + "type": "ligand", + "ccd": "XG4", + "count": 1 + } + ] +} \ No newline at end of file diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py index dee10bb0..da80d9af 100644 --- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py +++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py @@ -107,9 +107,9 @@ def make_ccd_conf_features(all_chain_info, ccd_preprocessed_dict, features[k] = np.concatenate(v, axis=0) features['ref_atom_count'] = np.bincount(features['ref_token2atom_idx']) - assert np.max(features['ref_element']) < 128 - assert np.max(features['ref_atom_name_chars']) < 64 - assert len(set([len(v) for k, v in features.items() if k != 'ref_atom_count'])) == 1 ## To check same Atom-level features. + assert np.max(features['ref_element']) < 128 # WTF? + assert np.max(features['ref_atom_name_chars']) < 64 # WTF? + assert len(set([len(v) for k, v in features.items() if k != 'ref_atom_count'])) == 1 ## To check same Atom-level features. # WTF? return features diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_token_feature.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_token_feature.py index 7bbb1805..6c16a2c4 100644 --- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_token_feature.py +++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_token_feature.py @@ -3,16 +3,17 @@ import collections import os import time -from typing import MutableMapping, Optional, List +from typing import Any, MutableMapping, Optional, List from absl import logging from helixfold.common import residue_constants from helixfold.data import parsers +from helixfold.data.tools import utils import numpy as np import json import gzip import pickle from rdkit import Chem - + FeatureDict = MutableMapping[str, np.ndarray] ELEMENT_MAPPING = Chem.GetPeriodicTable() @@ -56,6 +57,15 @@ def flatten_is_protein_features(is_protein_feats: np.ndarray) -> FeatureDict: return res + +def dump_all_ccd_keys(ccd_data: dict[str, Any]): + ccd_keys_file='all_ccd_keys.txt' + if not os.path.isfile(ccd_keys_file): + open(ccd_keys_file, 'w').write('\n'.join(ccd_data.keys())) + logging.warning(f'All ccd keys are dumped to {ccd_keys_file}') + return ccd_keys_file + + def make_sequence_features( all_chain_info, ccd_preprocessed_dict, extra_feats: Optional[dict]=None) -> FeatureDict: @@ -106,13 +116,18 @@ def make_sequence_features( sym_id = chainid_to_sym_id[_alphabet_chain_id] for residue_id, ccd_id in enumerate(ccd_seq): if ccd_id not in ccd_preprocessed_dict: - assert not extra_feats is None and ccd_id in extra_feats,\ - f'<{ccd_id}> not in ccd_preprocessed_dict, But got extra_feats is None' + if extra_feats is None: + ccd_kf=dump_all_ccd_keys(ccd_preprocessed_dict) + raise ValueError(f'<{ccd_id}> not in ccd_preprocessed_dict, But got extra_feats is None. See all keys in {ccd_kf}') + if ccd_id not in extra_feats: + ccd_kf=dump_all_ccd_keys(ccd_preprocessed_dict) + raise ValueError(f'<{ccd_id}> not in ccd_preprocessed_dict or extra_feats. See all keys in {ccd_kf}') _ccd_feats = extra_feats[ccd_id] else: _ccd_feats = ccd_preprocessed_dict[ccd_id] num_atoms = len(_ccd_feats['position']) - assert num_atoms > 0, f'TODO filter - Got CCD <{ccd_id}>: 0 atom nums.' + if num_atoms == 0: + raise NotImplementedError(f'TODO filter - Got CCD <{ccd_id}>: 0 atom nums.') if ccd_id not in residue_constants.STANDARD_LIST: features['asym_id'].append(np.array([chain_num_id] * num_atoms, dtype=np.int32)) @@ -198,12 +213,14 @@ def process(self, if ccd_preprocessed_dict is None: ccd_preprocessed_dict = {} - st_1 = time.time() - if 'pkl.gz' in self.ccd_preprocessed_path: - with gzip.open(self.ccd_preprocessed_path, "rb") as fp: - ccd_preprocessed_dict = pickle.load(fp) - logging.info(f'load ccd dataset done. use {time.time()-st_1}s') - + + if not 'pkl.gz' in self.ccd_preprocessed_path: + raise ValueError(f'Invalid ccd_preprocessed_path: {self.ccd_preprocessed_path}') + + with utils.timing('Loading CCD dataset'): + with gzip.open(self.ccd_preprocessed_path, "rb") as fp: + ccd_preprocessed_dict = pickle.load(fp) + if select_mmcif_chainID is not None: select_mmcif_chainID = set(select_mmcif_chainID) diff --git a/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py index 3db7640c..cbec3867 100644 --- a/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py +++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py @@ -4,15 +4,19 @@ import os from pathlib import Path import time, gzip, pickle +from typing import Optional, Tuple import numpy as np from absl import logging + + from helixfold.common import residue_constants from helixfold.data import parsers -from helixfold.data import pipeline_multimer +from helixfold.data.tools import utils +from helixfold.data import pipeline_multimer, pipeline_multimer_parallel from helixfold.data import pipeline_rna_multimer from helixfold.data import pipeline_conf_bonds, pipeline_token_feature, pipeline_hybrid from helixfold.data import label_utils -from concurrent.futures import ProcessPoolExecutor, as_completed + from .preprocess import digit2alphabet @@ -330,7 +334,7 @@ def add_assembly_features(all_chain_features, ccd_preprocessed_dict, no_msa_temp "label": label,} -def process_chain_msa(args): +def process_chain_msa(args: tuple[pipeline_multimer_parallel.DataPipeline, str, Optional[str],Optional[str], os.PathLike,os.PathLike ]) -> Tuple[str,dict, str, str]: """ 处理链,如果缓存了特征文件,则直接使用缓存的特征文件,否则生成新的特征文件。 @@ -360,12 +364,12 @@ def process_chain_msa(args): with open(features_pkl, 'rb') as f: raw_features = pickle.load(f) else: - t0 = time.time() - raw_features = data_pipeline._process_single_chain( - chain_id, sequence=seq, description=desc, - msa_output_dir=msa_output_dir, - is_homomer_or_monomer=False) - print(f'[MSA/Template] {desc}; seq length: {len(seq)}; use: {time.time() - t0}') + with utils.timing(f'[MSA/Template]({desc}) with seq length: {len(seq)}'): + raw_features = data_pipeline._process_single_chain( + chain_id, sequence=seq, description=desc, + msa_output_dir=msa_output_dir, + is_homomer_or_monomer=False) + with open(features_pkl, 'wb') as f: pickle.dump(raw_features, f, protocol=4) @@ -450,19 +454,10 @@ def process_input_json(all_entitys, ccd_preprocessed_path, ## 2. multiprocessing for protein/rna MSA/Template search. seqs_to_msa_features = {} - logging.info('[Multiprocess] starting MSA/Template search...') - t0 = time.time() - with ProcessPoolExecutor() as executor: - futures = [executor.submit(process_chain_msa, task) for task in tasks] - - for future in as_completed(futures): - try: - _, raw_features, type_chain_id, seqs = future.result() - seqs_to_msa_features[seqs] = raw_features - except Exception as exc: - import traceback; traceback.print_exc() - logging.error(f'Task generated an exception : {exc}') - logging.info(f'[Multiprocess] All msa/template use: {time.time() - t0}') + with utils.timing('MSA/Template search'): + for task in tasks: + _, raw_features, type_chain_id, seqs=process_chain_msa(task) + seqs_to_msa_features[seqs] = raw_features ## 3. add msa_templ_feats to all_chain_features. for type_chain_id in all_chain_features.keys(): From b3153de46b53518259e0335a7182aa121916c144 Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Mon, 19 Aug 2024 16:37:10 +0800 Subject: [PATCH 23/42] refactor: deduplicate code --- .../helixfold/data/pipeline_conf_bonds.py | 39 +++++++++++++++++-- .../helixfold/data/pipeline_token_feature.py | 14 ++----- .../infer_scripts/feature_processing_aa.py | 23 +++-------- 3 files changed, 45 insertions(+), 31 deletions(-) diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py index da80d9af..0da4a980 100644 --- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py +++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py @@ -1,10 +1,19 @@ """Functions for building the input features (reference ccd features) for the HelixFold model.""" import collections -from typing import Optional +import gzip +import os +import pickle +from typing import Any, Optional + +from absl import logging +from immutabledict import immutabledict from helixfold.common import residue_constants import numpy as np +from helixfold.data.tools import utils + + ALLOWED_LIGAND_BONDS_TYPE = { "SING": 1, "DOUB": 2, @@ -13,6 +22,25 @@ "AROM": 12, } +def load_ccd_dict(ccd_preprocessed_path: str) -> immutabledict[str, Any]: + if not os.path.exists(ccd_preprocessed_path): + raise FileNotFoundError(f'[CCD] ccd_preprocessed_path: {ccd_preprocessed_path} not exist.') + + if not ccd_preprocessed_path.endswith('.pkl.gz') and not ccd_preprocessed_path.endswith('.pkl'): + raise ValueError(f'[CCD] ccd_preprocessed_path: {ccd_preprocessed_path} not endswith .pkl.gz and .pkl') + + with utils.timing(f'Loading CCD dataset from {ccd_preprocessed_path}'): + if ccd_preprocessed_path.endswith('.pkl.gz'): + with gzip.open(ccd_preprocessed_path, "rb") as fp: + ccd_preprocessed_dict = immutabledict(pickle.load(fp)) + else: + with open(ccd_preprocessed_path, "rb") as fp: + ccd_preprocessed_dict = immutabledict(pickle.load(fp)) + + logging.info(f'CCD dataset contains {len(ccd_preprocessed_dict)} entries.') + + return ccd_preprocessed_dict + def element_map_with_x(atom_symbol): # ## one-hot max shape == 128 return residue_constants.ATOM_ELEMENT.get(atom_symbol, 127) @@ -107,8 +135,13 @@ def make_ccd_conf_features(all_chain_info, ccd_preprocessed_dict, features[k] = np.concatenate(v, axis=0) features['ref_atom_count'] = np.bincount(features['ref_token2atom_idx']) - assert np.max(features['ref_element']) < 128 # WTF? - assert np.max(features['ref_atom_name_chars']) < 64 # WTF? + if (_ref_element:=np.max(features['ref_element'])) >= 128: + raise ValueError(f'ref_element= {_ref_element}, which is larger then 128.\n{features["ref_element"]}\n{"-"*79}') + + + if (_ref_atom_name_chars:=np.max(features['ref_atom_name_chars'])) >= 64: + raise ValueError(f'ref_atom_name_chars= {_ref_atom_name_chars}, which is larger then 64.\n{features["ref_atom_name_chars"]}\n{"-"*79}') + assert len(set([len(v) for k, v in features.items() if k != 'ref_atom_count'])) == 1 ## To check same Atom-level features. # WTF? return features diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_token_feature.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_token_feature.py index 6c16a2c4..92ea0e1e 100644 --- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_token_feature.py +++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_token_feature.py @@ -7,11 +7,10 @@ from absl import logging from helixfold.common import residue_constants from helixfold.data import parsers -from helixfold.data.tools import utils +from helixfold.data.pipeline_conf_bonds import load_ccd_dict import numpy as np import json -import gzip -import pickle + from rdkit import Chem FeatureDict = MutableMapping[str, np.ndarray] @@ -212,14 +211,7 @@ def process(self, assembly_dict = unit_dict if ccd_preprocessed_dict is None: - ccd_preprocessed_dict = {} - - if not 'pkl.gz' in self.ccd_preprocessed_path: - raise ValueError(f'Invalid ccd_preprocessed_path: {self.ccd_preprocessed_path}') - - with utils.timing('Loading CCD dataset'): - with gzip.open(self.ccd_preprocessed_path, "rb") as fp: - ccd_preprocessed_dict = pickle.load(fp) + ccd_preprocessed_dict=load_ccd_dict(self.ccd_preprocessed_path) if select_mmcif_chainID is not None: select_mmcif_chainID = set(select_mmcif_chainID) diff --git a/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py index cbec3867..fd875159 100644 --- a/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py +++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py @@ -3,8 +3,9 @@ import copy import os from pathlib import Path -import time, gzip, pickle -from typing import Optional, Tuple +import pickle +from typing import Optional, Tuple, Any + import numpy as np from absl import logging @@ -17,6 +18,8 @@ from helixfold.data import pipeline_conf_bonds, pipeline_token_feature, pipeline_hybrid from helixfold.data import label_utils +from helixfold.data.tools import utils + from .preprocess import digit2alphabet @@ -24,20 +27,6 @@ STRING_FEATURES = ['all_chain_ids', 'all_ccd_ids','all_atom_ids', 'release_date','label_ccd_ids','label_atom_ids'] -def load_ccd_dict(ccd_preprocessed_path): - assert os.path.exists(ccd_preprocessed_path),\ - (f'[CCD] ccd_preprocessed_path: {ccd_preprocessed_path} not exist.') - st_1 = time.time() - if 'pkl.gz' in ccd_preprocessed_path: - with gzip.open(ccd_preprocessed_path, "rb") as fp: - ccd_preprocessed_dict = pickle.load(fp) - elif '.pkl' in ccd_preprocessed_path: - with open(ccd_preprocessed_path, "rb") as fp: - ccd_preprocessed_dict = pickle.load(fp) - print(f'[CCD] load ccd dataset done. use {time.time()-st_1}s;'\ - f'Has length of {len(ccd_preprocessed_dict)}') - - return ccd_preprocessed_dict def crop_msa(feat, max_msa_depth=16384): @@ -385,7 +374,7 @@ def process_input_json(all_entitys, ccd_preprocessed_path, no_msa_templ_feats=False): ## load ccd dict. - ccd_preprocessed_dict = load_ccd_dict(ccd_preprocessed_path) + ccd_preprocessed_dict = pipeline_conf_bonds.load_ccd_dict(ccd_preprocessed_path) all_chain_features = {} sequence_features = {} num_chains = 0 From cd68f004e4a7345363e15f073f64974638ea217f Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Mon, 19 Aug 2024 19:11:03 +0800 Subject: [PATCH 24/42] fix: hetatm input raise and dialognoses --- .../helixfold/data/pipeline_conf_bonds.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py index 0da4a980..0973f48c 100644 --- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py +++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py @@ -107,6 +107,13 @@ def make_ccd_conf_features(all_chain_info, ccd_preprocessed_dict, features['ref_element'].append(np.array([element_map_with_x(t[0].upper() + t[1:].lower()) for t in _ccd_feats['atom_symbol']], dtype=np.int32)) features['ref_charge'].append(np.array(_ccd_feats['charge'], dtype=np.int32)) + + # atom checks + for atom_id in _ccd_feats['atom_ids']: + converted_atom_id_name=convert_atom_id_name(atom_id) + if max(converted_atom_id_name)>= 64: + raise ValueError(f'>>> Problematic atom in ligand ({residue_id=}, {ccd_id=}, {chain_id=}) {atom_id=}, {converted_atom_id_name=}') + features['ref_atom_name_chars'].append( np.array([convert_atom_id_name(atom_id) for atom_id in _ccd_feats['atom_ids']] , dtype=np.int32)) @@ -135,12 +142,8 @@ def make_ccd_conf_features(all_chain_info, ccd_preprocessed_dict, features[k] = np.concatenate(v, axis=0) features['ref_atom_count'] = np.bincount(features['ref_token2atom_idx']) - if (_ref_element:=np.max(features['ref_element'])) >= 128: - raise ValueError(f'ref_element= {_ref_element}, which is larger then 128.\n{features["ref_element"]}\n{"-"*79}') - - - if (_ref_atom_name_chars:=np.max(features['ref_atom_name_chars'])) >= 64: - raise ValueError(f'ref_atom_name_chars= {_ref_atom_name_chars}, which is larger then 64.\n{features["ref_atom_name_chars"]}\n{"-"*79}') + if (len_ref_element:=np.max(features['ref_element'])) >= 128: + raise ValueError(f'{len_ref_element=}, which is larger then 128.\n{features["ref_element"]}\n{"-"*79}') assert len(set([len(v) for k, v in features.items() if k != 'ref_atom_count'])) == 1 ## To check same Atom-level features. # WTF? return features From 7180fe7fedea5eea174081d7b210365f558c2d7b Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Mon, 19 Aug 2024 20:02:23 +0800 Subject: [PATCH 25/42] fix: hetatm input raise from smiles --- .../helixfold3/helixfold/data/pipeline_conf_bonds.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py index 0973f48c..ddba9822 100644 --- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py +++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py @@ -45,13 +45,14 @@ def element_map_with_x(atom_symbol): # ## one-hot max shape == 128 return residue_constants.ATOM_ELEMENT.get(atom_symbol, 127) -def convert_atom_id_name(atom_id: str) -> int: +def convert_atom_id_name(atom_id: str) -> list[int]: """ Converts unique atom_id names to integer of atom_name. need to be padded to length 4. Each character is encoded as ord(c) − 32 """ + if (len_atom_id:=len(atom_id))>4: + raise ValueError(f'atom_id: `{atom_id}` is too long, max length is 4.') atom_id_pad = atom_id.ljust(4, ' ') - assert len(atom_id_pad) == 4 return [ord(c) - 32 for c in atom_id_pad] @@ -110,12 +111,13 @@ def make_ccd_conf_features(all_chain_info, ccd_preprocessed_dict, # atom checks for atom_id in _ccd_feats['atom_ids']: - converted_atom_id_name=convert_atom_id_name(atom_id) + converted_atom_id_name=convert_atom_id_name(atom_id.upper()) if max(converted_atom_id_name)>= 64: raise ValueError(f'>>> Problematic atom in ligand ({residue_id=}, {ccd_id=}, {chain_id=}) {atom_id=}, {converted_atom_id_name=}') + logging.debug(f'({residue_id=}, {ccd_id=}, {chain_id=}) {atom_id=}, {converted_atom_id_name=}') features['ref_atom_name_chars'].append( - np.array([convert_atom_id_name(atom_id) for atom_id in _ccd_feats['atom_ids']] + np.array([convert_atom_id_name(atom_id.upper()) for atom_id in _ccd_feats['atom_ids']] , dtype=np.int32)) # here we get ref_space_uid [ Each (chain id, residue index) tuple is assigned an integer on first appearance.] From 330e4551c5485b9bf2d97abdc0fa78f9eda08d6d Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Mon, 19 Aug 2024 22:05:07 +0800 Subject: [PATCH 26/42] feat: ligand: support obable readable files --- .../helixfold3/data/demo_p450_heme.json | 19 +++++ .../helixfold3/data/demo_p450_heme_sdf.json | 19 +++++ .../data/demo_p450_heme_smiles.json | 19 +++++ .../helixfold/infer_scripts/preprocess.py | 85 ++++++++++++++----- .../helixfold3/helixfold/inference.py | 11 ++- 5 files changed, 124 insertions(+), 29 deletions(-) create mode 100644 apps/protein_folding/helixfold3/data/demo_p450_heme.json create mode 100644 apps/protein_folding/helixfold3/data/demo_p450_heme_sdf.json create mode 100644 apps/protein_folding/helixfold3/data/demo_p450_heme_smiles.json diff --git a/apps/protein_folding/helixfold3/data/demo_p450_heme.json b/apps/protein_folding/helixfold3/data/demo_p450_heme.json new file mode 100644 index 00000000..a414ef58 --- /dev/null +++ b/apps/protein_folding/helixfold3/data/demo_p450_heme.json @@ -0,0 +1,19 @@ +{ + "entities": [ + { + "type": "protein", + "sequence": "MDALYKSTVAKFNEVIQLDCSTEFFSIALSSIAGILLLLLLFRSKRHSSLKLPPGKLGIPFIGESFIFLRALRSNSLEQFFDERVKKFGLVFKTSLIGHPTVVLCGPAGNRLILSNEEKLVQMSWPAQFMKLMGENSVATRRGEDHIVMRSALAGFFGPGALQSYIGKMNTEIQSHINEKWKGKDEVNVLPLVRELVFNISAILFFNIYDKQEQDRLHKLLETILVGSFALPIDLPGFGFHRALQGRAKLNKIMLSLIKKRKEDLQSGSATATQDLLSVLLTFRDDKGTPLTNDEILDNFSSLLHASYDTTTSPMALIFKLLSSNPECYQKVVQEQLEILSNKEEGEEITWKDLKAMKYTWQVAQETLRMFPPVFGTFRKAITDIQYDGYTIPKGWKLLWTTYSTHPKDLYFNEPEKFMPSRFDQEGKHVAPYTFLPFGGGQRSCVGWEFSKMEILLFVHHFVKTFSSYTPVDPDEKISGDPLPPLPSKGFSIKLFPRP", + "count": 1 + }, + { + "type": "ligand", + "ccd": "HEM", + "count": 1 + }, + { + "type": "ligand", + "smiles": "CC1=C2CC[C@@]3(CCCC(=C)[C@H]3C[C@@H](C2(C)C)CC1)C", + "count": 1 + } + ] +} \ No newline at end of file diff --git a/apps/protein_folding/helixfold3/data/demo_p450_heme_sdf.json b/apps/protein_folding/helixfold3/data/demo_p450_heme_sdf.json new file mode 100644 index 00000000..72500c6e --- /dev/null +++ b/apps/protein_folding/helixfold3/data/demo_p450_heme_sdf.json @@ -0,0 +1,19 @@ +{ + "entities": [ + { + "type": "protein", + "sequence": "MDALYKSTVAKFNEVIQLDCSTEFFSIALSSIAGILLLLLLFRSKRHSSLKLPPGKLGIPFIGESFIFLRALRSNSLEQFFDERVKKFGLVFKTSLIGHPTVVLCGPAGNRLILSNEEKLVQMSWPAQFMKLMGENSVATRRGEDHIVMRSALAGFFGPGALQSYIGKMNTEIQSHINEKWKGKDEVNVLPLVRELVFNISAILFFNIYDKQEQDRLHKLLETILVGSFALPIDLPGFGFHRALQGRAKLNKIMLSLIKKRKEDLQSGSATATQDLLSVLLTFRDDKGTPLTNDEILDNFSSLLHASYDTTTSPMALIFKLLSSNPECYQKVVQEQLEILSNKEEGEEITWKDLKAMKYTWQVAQETLRMFPPVFGTFRKAITDIQYDGYTIPKGWKLLWTTYSTHPKDLYFNEPEKFMPSRFDQEGKHVAPYTFLPFGGGQRSCVGWEFSKMEILLFVHHFVKTFSSYTPVDPDEKISGDPLPPLPSKGFSIKLFPRP", + "count": 1 + }, + { + "type": "ligand", + "ccd": "HEM", + "count": 1 + }, + { + "type": "ligand", + "sdf": "/mnt/data/yinying/tests/helixfold/ligands/60119277-3d.sdf", + "count": 1 + } + ] +} \ No newline at end of file diff --git a/apps/protein_folding/helixfold3/data/demo_p450_heme_smiles.json b/apps/protein_folding/helixfold3/data/demo_p450_heme_smiles.json new file mode 100644 index 00000000..cb6a22f5 --- /dev/null +++ b/apps/protein_folding/helixfold3/data/demo_p450_heme_smiles.json @@ -0,0 +1,19 @@ +{ + "entities": [ + { + "type": "protein", + "sequence": "MDALYKSTVAKFNEVIQLDCSTEFFSIALSSIAGILLLLLLFRSKRHSSLKLPPGKLGIPFIGESFIFLRALRSNSLEQFFDERVKKFGLVFKTSLIGHPTVVLCGPAGNRLILSNEEKLVQMSWPAQFMKLMGENSVATRRGEDHIVMRSALAGFFGPGALQSYIGKMNTEIQSHINEKWKGKDEVNVLPLVRELVFNISAILFFNIYDKQEQDRLHKLLETILVGSFALPIDLPGFGFHRALQGRAKLNKIMLSLIKKRKEDLQSGSATATQDLLSVLLTFRDDKGTPLTNDEILDNFSSLLHASYDTTTSPMALIFKLLSSNPECYQKVVQEQLEILSNKEEGEEITWKDLKAMKYTWQVAQETLRMFPPVFGTFRKAITDIQYDGYTIPKGWKLLWTTYSTHPKDLYFNEPEKFMPSRFDQEGKHVAPYTFLPFGGGQRSCVGWEFSKMEILLFVHHFVKTFSSYTPVDPDEKISGDPLPPLPSKGFSIKLFPRP", + "count": 1 + }, + { + "type": "ligand", + "smiles": "CC1=C(C2=CC3=C(C(=C([N-]3)C=C4C(=C(C(=N4)C=C5C(=C(C(=N5)C=C1[N-]2)C)C=C)C)C=C)C)CCC(=O)O)CCC(=O)O.[Fe+2]", + "count": 1 + }, + { + "type": "ligand", + "smiles": "CC1=C2CC[C@@]3(CCCC(=C)[C@H]3C[C@@H](C2(C)C)CC1)C", + "count": 1 + } + ] +} \ No newline at end of file diff --git a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py index eb8eb14f..f668f1bd 100644 --- a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py +++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py @@ -16,6 +16,8 @@ import subprocess import tempfile import itertools +from absl import logging +from typing import Tuple, Union, Mapping, Literal, Callable, Any import rdkit from rdkit import Chem from rdkit.Chem import AllChem @@ -138,26 +140,54 @@ def smiles_to_ETKDGMol(smiles): return optimal_mol_wo_H -def smiles_toMol_obabel(smiles): - """ - generate mol from smiles using obabel; - """ - - OBABEL_BIN = os.getenv('OBABEL_BIN') - if not (OBABEL_BIN and os.path.isfile(OBABEL_BIN)): - raise FileNotFoundError(f'Cannot find obabel binary at {OBABEL_BIN}.') - - with tempfile.NamedTemporaryFile(suffix=".mol2") as temp_file: - print(f"[OBABEL] Temporary file created: {temp_file.name}") - obabel_cmd = f"{OBABEL_BIN} -:'{smiles}' -omol2 -O{temp_file.name} --gen3d" +class Mol2MolObabel: + def __init__(self): + self.obabel_bin = os.getenv('OBABEL_BIN') + if not (self.obabel_bin and os.path.isfile(self.obabel_bin)): + raise FileNotFoundError(f'Cannot find obabel binary at {self.obabel_bin}.') + + # Get the supported formats + self.supported_formats: Tuple[str] = self._get_supported_formats() + + def _get_supported_formats(self) -> Tuple[str]: + """ + Retrieves the list of supported formats from obabel and filters out write-only formats. + + Returns: + tuple: A tuple of supported input formats. + """ + obabel_cmd = f"{self.obabel_bin} -L formats" ret = subprocess.run(obabel_cmd, shell=True, capture_output=True, text=True) - mol = Chem.MolFromMol2File(temp_file.name, sanitize=False) - if '3D coordinate generation failed' in ret.stderr: - mol = generate_ETKDGv3_conformer(mol) - optimal_mol_wo_H = Chem.RemoveAllHs(mol, sanitize=False) - return optimal_mol_wo_H + formats = [line.split()[0] for line in ret.stdout.splitlines() if '[Write-only]' not in line] + formats.append('smiles') + + return tuple(formats) + def _perform_conversion(self, input_type: str, input_value: str) -> Chem.Mol: + with tempfile.NamedTemporaryFile(suffix=".mol2") as temp_file: + print(f"[OBABEL] Temporary file created: {temp_file.name}") + if input_type == 'smiles': + obabel_cmd = f"{self.obabel_bin} -:'{input_value}' -omol2 -O{temp_file.name} --gen3d" + else: + obabel_cmd = f"{self.obabel_bin} -i {input_type} {input_value} -omol2 -O{temp_file.name} --gen3d" + ret = subprocess.run(obabel_cmd, shell=True, capture_output=True, text=True) + mol = Chem.MolFromMol2File(temp_file.name, sanitize=False) + if '3D coordinate generation failed' in ret.stderr: + mol = generate_ETKDGv3_conformer(mol) + optimal_mol_wo_H = Chem.RemoveAllHs(mol, sanitize=False) + logging.debug(f'Converted `{input_type}`: {input_value}') + return optimal_mol_wo_H + + def _convert_to_mol(self, input_type: str, input_value: str) -> Chem.Mol: + if input_type not in self.supported_formats: + raise ValueError(f'Unsupported small molecule input: {input_type}. \nSupported formats: \n{self.supported_formats}\n') + + if input_type != 'smiles' and not os.path.isfile(input_value): + raise FileNotFoundError(f'Cannot find the {input_type.upper()} file at {input_value}.') + + return self._perform_conversion(input_type, input_value) + __call__: Callable[[str, str], Chem.Mol] = _convert_to_mol def polymer_convert(items): """ "type": "protein", @@ -192,7 +222,7 @@ def polymer_convert(items): } -def ligand_convert(items): +def ligand_convert(items: Mapping[str, Union[int, str]]): """ "type": "ligand", "ccd": "ATP", or "smiles": "CCccc(O)ccc", @@ -200,22 +230,31 @@ def ligand_convert(items): """ dtype = items['type'] count = items['count'] + converter=Mol2MolObabel() msa_seqs = "" _ccd_seqs = [] ccd_to_extra_mol_infos = {} if 'ccd' in items: _ccd_seqs.append(f"({items['ccd']})") - elif 'smiles' in items: - _ccd_seqs.append(f"(UNK-)") + + + elif any(f in items for f in converter.supported_formats): + for k in converter.supported_formats: + if k in items: + break + + ligand_name="UNK-" + _ccd_seqs.append(f"({ligand_name})") # mol_wo_h = smiles_to_ETKDGMol(items['smiles']) - mol_wo_h = smiles_toMol_obabel(items['smiles']) + + mol_wo_h = converter(k, items[k]) _extra_mol_infos = make_basic_info_fromMol(mol_wo_h) ccd_to_extra_mol_infos = { - "UNK-": _extra_mol_infos + ligand_name: _extra_mol_infos } else: - raise ValueError(f'not support for the {dtype} in ligand_convert') + raise ValueError(f'not support for the {dtype} in ligand_convert, please check the input. \nSupported input: {converter.supported_formats}') ccd_seqs = ''.join(_ccd_seqs) ## (GLY)(ALA)..... # repeat_ccds, repeat_fasta = [ccd_seqs], [msa_seqs] diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py index 7734c964..76f40d96 100644 --- a/apps/protein_folding/helixfold3/helixfold/inference.py +++ b/apps/protein_folding/helixfold3/helixfold/inference.py @@ -102,16 +102,15 @@ def preprocess_json_entity(json_path, out_dir): def convert_to_json_compatible(obj): if isinstance(obj, np.ndarray): return obj.tolist() - elif isinstance(obj, np.integer): + if isinstance(obj, np.integer): return int(obj) - elif isinstance(obj, np.floating): + if isinstance(obj, np.floating): return float(obj) - elif isinstance(obj, dict): + if isinstance(obj, dict): return {k: convert_to_json_compatible(v) for k, v in obj.items()} - elif isinstance(obj, list): + if isinstance(obj, list): return [convert_to_json_compatible(i) for i in obj] - else: - return obj + return obj def resolve_bin_path(cfg_path: str, default_binary_name: str)-> str: """Helper function to resolve the binary path.""" From dce79f8e787c83bfd379d65efa12b7204ad980e0 Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Mon, 19 Aug 2024 22:23:49 +0800 Subject: [PATCH 27/42] fix: output logs --- .../helixfold3/data/demo_3fap_protein_sm.json | 19 ++++++++++++++ .../helixfold/infer_scripts/preprocess.py | 26 ++++++++++--------- 2 files changed, 33 insertions(+), 12 deletions(-) create mode 100644 apps/protein_folding/helixfold3/data/demo_3fap_protein_sm.json diff --git a/apps/protein_folding/helixfold3/data/demo_3fap_protein_sm.json b/apps/protein_folding/helixfold3/data/demo_3fap_protein_sm.json new file mode 100644 index 00000000..6b2f9418 --- /dev/null +++ b/apps/protein_folding/helixfold3/data/demo_3fap_protein_sm.json @@ -0,0 +1,19 @@ +{ + "entities": [ + { + "type": "protein", + "sequence": "GVQVETISPGDGRTFPKRGQTCVVHYTGMLEDGKKFDSSRDRNKPFKFMLGKQEVIRGWEEGVAQMSVGQRAKLTISPDYAYGATGHPGIIPPHATLVFDVELLKLE", + "count": 1 + }, + { + "type": "protein", + "sequence": "VAILWHEMWHEGLEEASRLYFGERNVKGMFEVLEPLHAMMERGPQTLKETSFNQAYGRDLMEAQEWCRKYMKSGNVKDLTQAWDLYYHVFRRIS", + "count": 1 + }, + { + "type": "ligand", + "smiles": "Cc1ccc(s1)[C@@H]\\2C[C@@H]3CC[C@H]([C@@](O3)(C(=O)C(=O)N4CCCC[C@H]4C(=O)O[C@@H](CC(=O)[C@@H](/C=C(/[C@H]([C@H](C(=O)[C@@H](C[C@@H](/C=C/C=C/C=C2\\C)C)C)OC)O)\\C)C)[C@H](C)C[C@@H]5CC[C@H]([C@@H](C5)OC)O)O)C", + "count": 1 + } + ] +} \ No newline at end of file diff --git a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py index f668f1bd..6314fccd 100644 --- a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py +++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py @@ -22,6 +22,7 @@ from rdkit import Chem from rdkit.Chem import AllChem from helixfold.common import residue_constants +from helixfold.data.tools import utils ## NOTE: this mapping is only useful for standard dna/rna/protein sequence input. @@ -165,18 +166,19 @@ def _get_supported_formats(self) -> Tuple[str]: def _perform_conversion(self, input_type: str, input_value: str) -> Chem.Mol: with tempfile.NamedTemporaryFile(suffix=".mol2") as temp_file: - print(f"[OBABEL] Temporary file created: {temp_file.name}") - if input_type == 'smiles': - obabel_cmd = f"{self.obabel_bin} -:'{input_value}' -omol2 -O{temp_file.name} --gen3d" - else: - obabel_cmd = f"{self.obabel_bin} -i {input_type} {input_value} -omol2 -O{temp_file.name} --gen3d" - ret = subprocess.run(obabel_cmd, shell=True, capture_output=True, text=True) - mol = Chem.MolFromMol2File(temp_file.name, sanitize=False) - if '3D coordinate generation failed' in ret.stderr: - mol = generate_ETKDGv3_conformer(mol) - optimal_mol_wo_H = Chem.RemoveAllHs(mol, sanitize=False) - logging.debug(f'Converted `{input_type}`: {input_value}') - return optimal_mol_wo_H + with utils.timing(f'converting {input_type} to mol2: {input_value}'): + if input_type == 'smiles': + obabel_cmd = f"{self.obabel_bin} -:'{input_value}' -omol2 -O{temp_file.name} --gen3d" + if len(input_value)>60: + logging.warning(f'This takes a while ...') + else: + obabel_cmd = f"{self.obabel_bin} -i {input_type} {input_value} -omol2 -O{temp_file.name} --gen3d" + ret = subprocess.run(obabel_cmd, shell=True, capture_output=True, text=True) + mol = Chem.MolFromMol2File(temp_file.name, sanitize=False) + if '3D coordinate generation failed' in ret.stderr: + mol = generate_ETKDGv3_conformer(mol) + optimal_mol_wo_H = Chem.RemoveAllHs(mol, sanitize=False) + return optimal_mol_wo_H def _convert_to_mol(self, input_type: str, input_value: str) -> Chem.Mol: if input_type not in self.supported_formats: From 6968f690b223f40eeb681dcc5aa5227d4f145fa5 Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Mon, 19 Aug 2024 23:04:45 +0800 Subject: [PATCH 28/42] fix: template try-except bugs --- .../helixfold3/helixfold/data/pipeline_conf_bonds.py | 2 +- .../helixfold3/helixfold/data/templates.py | 12 ++++-------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py index ddba9822..c8947643 100644 --- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py +++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py @@ -114,7 +114,7 @@ def make_ccd_conf_features(all_chain_info, ccd_preprocessed_dict, converted_atom_id_name=convert_atom_id_name(atom_id.upper()) if max(converted_atom_id_name)>= 64: raise ValueError(f'>>> Problematic atom in ligand ({residue_id=}, {ccd_id=}, {chain_id=}) {atom_id=}, {converted_atom_id_name=}') - logging.debug(f'({residue_id=}, {ccd_id=}, {chain_id=}) {atom_id=}, {converted_atom_id_name=}') + # logging.debug(f'({residue_id=}, {ccd_id=}, {chain_id=}) {atom_id=}, {converted_atom_id_name=}') features['ref_atom_name_chars'].append( np.array([convert_atom_id_name(atom_id.upper()) for atom_id in _ccd_feats['atom_ids']] diff --git a/apps/protein_folding/helixfold3/helixfold/data/templates.py b/apps/protein_folding/helixfold3/helixfold/data/templates.py index f2e3289a..f9efdae0 100644 --- a/apps/protein_folding/helixfold3/helixfold/data/templates.py +++ b/apps/protein_folding/helixfold3/helixfold/data/templates.py @@ -817,19 +817,15 @@ def _process_single_hit( TemplateAtomMaskAllZerosError) as e: # These 3 errors indicate missing mmCIF experimental data rather than a # problem with the template search, so turn them into warnings. - warning = ('%s_%s (sum_probs: %.2f, rank: %d): feature extracting errors: ' - '%s, mmCIF parsing errors: %s' - % (hit_pdb_code, hit_chain_id, hit.sum_probs, hit.index, - str(e), parsing_result.errors)) + warning = (f'{hit_pdb_code}_{hit_chain_id} (sum_probs: {hit.sum_probs}, rank: {hit.index}): feature extracting errors: ' + f'{str(e)}, mmCIF parsing errors: {parsing_result.errors}') if strict_error_check: return SingleHitResult(features=None, error=warning, warning=None) else: return SingleHitResult(features=None, error=None, warning=warning) except Error as e: - error = ('%s_%s (sum_probs: %.2f, rank: %d): feature extracting errors: ' - '%s, mmCIF parsing errors: %s' - % (hit_pdb_code, hit_chain_id, hit.sum_probs, hit.index, - str(e), parsing_result.errors)) + error = (f'{hit_pdb_code}_{hit_chain_id} (sum_probs: {hit.sum_probs}, rank: {hit.index}): feature extracting errors: ' + f'{str(e)}, mmCIF parsing errors: {parsing_result.errors}') return SingleHitResult(features=None, error=error, warning=None) From 90f731a5900bfcfe5432bbe47f0265970ef9ed29 Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Mon, 19 Aug 2024 23:10:17 +0800 Subject: [PATCH 29/42] docs: sm file inputs --- apps/protein_folding/helixfold3/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/protein_folding/helixfold3/README.md b/apps/protein_folding/helixfold3/README.md index c8fc9c8c..639a46be 100644 --- a/apps/protein_folding/helixfold3/README.md +++ b/apps/protein_folding/helixfold3/README.md @@ -105,8 +105,8 @@ The script `scripts/download_all_data.sh` can be used to download and set up all There are some demo input under `./data/` for your test and reference. Data input is in the form of JSON containing several entities such as `protein`, `ligand`, `nucleic acids`, and `iron`. Proteins and nucleic acids inputs are their sequence. -HelixFold3 supports input ligand as SMILES or CCD id, please refer to `/data/demo_6zcy_smiles.json` and `demo_output/demo_6zcy_smiles/` -for more details about SMILES input. More flexible input will come in soon. +HelixFold3 supports input ligand as SMILES, CCD id or small molecule files, please refer to `/data/demo_6zcy_smiles.json` and `data/demo_p450_heme_sdf.json` +for more details about SMILES input. Flexible input from small molecule is now supported. See `obabel -L formats |grep -v 'Write-only'` A example of input data is as follows: ```json From 2bd81ad8d5a5d0aa4c23e3f4f9cfa78c249cc337 Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Tue, 20 Aug 2024 08:48:18 +0800 Subject: [PATCH 30/42] drop: buildin logger --- .../helixfold/data/mmcif_parsing_paddle.py | 2 +- .../helixfold3/helixfold/data/tools/utils.py | 2 +- .../helixfold/infer_scripts/preprocess.py | 29 ++++++++++--------- .../helixfold3/helixfold/inference.py | 1 - 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/apps/protein_folding/helixfold3/helixfold/data/mmcif_parsing_paddle.py b/apps/protein_folding/helixfold3/helixfold/data/mmcif_parsing_paddle.py index d5e8f875..2af97446 100644 --- a/apps/protein_folding/helixfold3/helixfold/data/mmcif_parsing_paddle.py +++ b/apps/protein_folding/helixfold3/helixfold/data/mmcif_parsing_paddle.py @@ -22,7 +22,7 @@ from Bio import PDB import numpy as np import time -import logging +from absl import logging from helixfold.common.residue_constants import crystallization_aids, ligand_exclusion_list # Type aliases: diff --git a/apps/protein_folding/helixfold3/helixfold/data/tools/utils.py b/apps/protein_folding/helixfold3/helixfold/data/tools/utils.py index c415dac8..c588187d 100644 --- a/apps/protein_folding/helixfold3/helixfold/data/tools/utils.py +++ b/apps/protein_folding/helixfold3/helixfold/data/tools/utils.py @@ -16,7 +16,7 @@ import contextlib import shutil -import logging +from absl import logging import tempfile import time from typing import Optional diff --git a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py index 6314fccd..53a0e588 100644 --- a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py +++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py @@ -165,20 +165,21 @@ def _get_supported_formats(self) -> Tuple[str]: return tuple(formats) def _perform_conversion(self, input_type: str, input_value: str) -> Chem.Mol: - with tempfile.NamedTemporaryFile(suffix=".mol2") as temp_file: - with utils.timing(f'converting {input_type} to mol2: {input_value}'): - if input_type == 'smiles': - obabel_cmd = f"{self.obabel_bin} -:'{input_value}' -omol2 -O{temp_file.name} --gen3d" - if len(input_value)>60: - logging.warning(f'This takes a while ...') - else: - obabel_cmd = f"{self.obabel_bin} -i {input_type} {input_value} -omol2 -O{temp_file.name} --gen3d" - ret = subprocess.run(obabel_cmd, shell=True, capture_output=True, text=True) - mol = Chem.MolFromMol2File(temp_file.name, sanitize=False) - if '3D coordinate generation failed' in ret.stderr: - mol = generate_ETKDGv3_conformer(mol) - optimal_mol_wo_H = Chem.RemoveAllHs(mol, sanitize=False) - return optimal_mol_wo_H + with tempfile.NamedTemporaryFile(suffix=".mol2") as temp_file, utils.timing(f'converting {input_type} to mol2: {input_value}'): + if input_type == 'smiles': + obabel_cmd = f"{self.obabel_bin} -:'{input_value}' -omol2 -O{temp_file.name} --gen3d" + if len(input_value)>60: + logging.warning(f'This takes a while ...') + else: + obabel_cmd = f"{self.obabel_bin} -i {input_type} {input_value} -omol2 -O{temp_file.name} --gen3d" + logging.debug(f'Launching command: `{obabel_cmd}`') + ret = subprocess.run(obabel_cmd, shell=True, capture_output=True, text=True) + mol = Chem.MolFromMol2File(temp_file.name, sanitize=False) + if '3D coordinate generation failed' in ret.stderr: + mol = generate_ETKDGv3_conformer(mol) + optimal_mol_wo_H = Chem.RemoveAllHs(mol, sanitize=False) + + return optimal_mol_wo_H def _convert_to_mol(self, input_type: str, input_value: str) -> Chem.Mol: if input_type not in self.supported_formats: diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py index 76f40d96..e18029a5 100644 --- a/apps/protein_folding/helixfold3/helixfold/inference.py +++ b/apps/protein_folding/helixfold3/helixfold/inference.py @@ -22,7 +22,6 @@ import pickle import pathlib import shutil -import logging import numpy as np import shutil From 07203113071ceb17a4f02d3cceee682d610a672b Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Tue, 20 Aug 2024 08:54:05 +0800 Subject: [PATCH 31/42] drop: buildin logger --- .../helixfold3/helixfold/data/mmcif_parsing_paddle.py | 1 - apps/protein_folding/helixfold3/helixfold/utils/model.py | 1 - 2 files changed, 2 deletions(-) diff --git a/apps/protein_folding/helixfold3/helixfold/data/mmcif_parsing_paddle.py b/apps/protein_folding/helixfold3/helixfold/data/mmcif_parsing_paddle.py index 2af97446..d6f907eb 100644 --- a/apps/protein_folding/helixfold3/helixfold/data/mmcif_parsing_paddle.py +++ b/apps/protein_folding/helixfold3/helixfold/data/mmcif_parsing_paddle.py @@ -34,7 +34,6 @@ SeqRes = str MmCIFDict = Mapping[str, Sequence[str]] -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') @dataclasses.dataclass(frozen=True) class Monomer: diff --git a/apps/protein_folding/helixfold3/helixfold/utils/model.py b/apps/protein_folding/helixfold3/helixfold/utils/model.py index 885510bd..1fcc53d4 100644 --- a/apps/protein_folding/helixfold3/helixfold/utils/model.py +++ b/apps/protein_folding/helixfold3/helixfold/utils/model.py @@ -17,7 +17,6 @@ import numpy as np import paddle import paddle.nn as nn -import logging import io from helixfold.model import modules_all_atom From d7bd9eeee7ea95ab9c57248853c6b214a2bd9909 Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Tue, 20 Aug 2024 09:18:53 +0800 Subject: [PATCH 32/42] add: use_3d opt for ligand --- .../helixfold3/data/demo_3fap_protein_sm.json | 3 ++- .../helixfold/infer_scripts/preprocess.py | 14 +++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/apps/protein_folding/helixfold3/data/demo_3fap_protein_sm.json b/apps/protein_folding/helixfold3/data/demo_3fap_protein_sm.json index 6b2f9418..caf27f02 100644 --- a/apps/protein_folding/helixfold3/data/demo_3fap_protein_sm.json +++ b/apps/protein_folding/helixfold3/data/demo_3fap_protein_sm.json @@ -12,7 +12,8 @@ }, { "type": "ligand", - "smiles": "Cc1ccc(s1)[C@@H]\\2C[C@@H]3CC[C@H]([C@@](O3)(C(=O)C(=O)N4CCCC[C@H]4C(=O)O[C@@H](CC(=O)[C@@H](/C=C(/[C@H]([C@H](C(=O)[C@@H](C[C@@H](/C=C/C=C/C=C2\\C)C)C)OC)O)\\C)C)[C@H](C)C[C@@H]5CC[C@H]([C@@H](C5)OC)O)O)C", + "sdf": "/mnt/data/yinying/tests/helixfold/ligands/ARD_ideal.sdf", + "use_3d": false, "count": 1 } ] diff --git a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py index 53a0e588..fa30f96a 100644 --- a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py +++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py @@ -164,14 +164,14 @@ def _get_supported_formats(self) -> Tuple[str]: return tuple(formats) - def _perform_conversion(self, input_type: str, input_value: str) -> Chem.Mol: + def _perform_conversion(self, input_type: str, input_value: str, generate_3d: bool=True) -> Chem.Mol: with tempfile.NamedTemporaryFile(suffix=".mol2") as temp_file, utils.timing(f'converting {input_type} to mol2: {input_value}'): if input_type == 'smiles': - obabel_cmd = f"{self.obabel_bin} -:'{input_value}' -omol2 -O{temp_file.name} --gen3d" + obabel_cmd = f"{self.obabel_bin} -:'{input_value}' -omol2 -O{temp_file.name} {'--gen3d' if generate_3d else ''}" if len(input_value)>60: logging.warning(f'This takes a while ...') else: - obabel_cmd = f"{self.obabel_bin} -i {input_type} {input_value} -omol2 -O{temp_file.name} --gen3d" + obabel_cmd = f"{self.obabel_bin} -i {input_type} {input_value} -omol2 -O{temp_file.name} {'--gen3d' if generate_3d else ''}" logging.debug(f'Launching command: `{obabel_cmd}`') ret = subprocess.run(obabel_cmd, shell=True, capture_output=True, text=True) mol = Chem.MolFromMol2File(temp_file.name, sanitize=False) @@ -181,16 +181,16 @@ def _perform_conversion(self, input_type: str, input_value: str) -> Chem.Mol: return optimal_mol_wo_H - def _convert_to_mol(self, input_type: str, input_value: str) -> Chem.Mol: + def _convert_to_mol(self, input_type: str, input_value: str, generate_3d: bool=True) -> Chem.Mol: if input_type not in self.supported_formats: raise ValueError(f'Unsupported small molecule input: {input_type}. \nSupported formats: \n{self.supported_formats}\n') if input_type != 'smiles' and not os.path.isfile(input_value): raise FileNotFoundError(f'Cannot find the {input_type.upper()} file at {input_value}.') - return self._perform_conversion(input_type, input_value) + return self._perform_conversion(input_type, input_value, generate_3d) - __call__: Callable[[str, str], Chem.Mol] = _convert_to_mol + __call__: Callable[[str, str, bool], Chem.Mol] = _convert_to_mol def polymer_convert(items): """ "type": "protein", @@ -251,7 +251,7 @@ def ligand_convert(items: Mapping[str, Union[int, str]]): _ccd_seqs.append(f"({ligand_name})") # mol_wo_h = smiles_to_ETKDGMol(items['smiles']) - mol_wo_h = converter(k, items[k]) + mol_wo_h = converter(k, items[k], items.get('use_3d', True)) _extra_mol_infos = make_basic_info_fromMol(mol_wo_h) ccd_to_extra_mol_infos = { ligand_name: _extra_mol_infos From 96a3de830649536a5cf4efafad0341972e4230ca Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Tue, 20 Aug 2024 17:07:17 +0800 Subject: [PATCH 33/42] fix: enable mutiple config overrides --- .../helixfold3/helixfold/config/helixfold.yaml | 2 +- .../helixfold3/helixfold/model/config.py | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml index bc163019..6b40ea23 100644 --- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml +++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml @@ -60,7 +60,7 @@ preset: # Other configurations other: - maxit_binary: /mnt/data/yinying/software/maxit/maxit-v11.100-prod-src/bin/maxit # Corresponds to --maxit_binary + maxit_binary: /mnt/data/software/maxit/maxit-v11.100-prod-src/bin/maxit # Corresponds to --maxit_binary # CONFIG_DIFFS for advanced configuration diff --git a/apps/protein_folding/helixfold3/helixfold/model/config.py b/apps/protein_folding/helixfold3/helixfold/model/config.py index f6d2fe97..f9dbbf1d 100644 --- a/apps/protein_folding/helixfold3/helixfold/model/config.py +++ b/apps/protein_folding/helixfold3/helixfold/model/config.py @@ -39,14 +39,17 @@ def model_config(config_diffs: Union[str, DictConfig, dict[str, dict[str, Any]]] cfg.merge_with_dotlist(updated_config) print(f'Updated config from `CONFIG_DIFFS.{preset_name}`: {updated_config}') - return cfg # update from detailed configuration if any(root_kw in config_diffs for root_kw in CONFIG_ALLATOM): - cfg.merge_with(config_diffs) # merge to override - print(f'Updated config from `CONFIG_DIFFS`: {config_diffs}') - return cfg + for root_kw in CONFIG_ALLATOM: + if root_kw not in config_diffs: + continue + cfg.merge_with(DictConfig({root_kw:config_diffs[root_kw]})) # merge to override + print(f'Updated config from `CONFIG_DIFFS`:{root_kw}: {config_diffs[root_kw]}') + + return cfg raise ValueError(f'Invalid config_diffs ({type(config_diffs)}): {config_diffs}') From 55e0c19124c655424b41f7ce132fe8262dc4f49f Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Fri, 23 Aug 2024 15:30:36 +0800 Subject: [PATCH 34/42] dev:major: covalent bond --- .../helixfold3/data/7s69_glycan.sdf | 155 ++++++++++++++++++ .../helixfold3/data/demo_7s69_coval.json | 19 +++ .../helixfold/data/pipeline_conf_bonds.py | 121 ++++++++++++-- .../infer_scripts/feature_processing_aa.py | 44 ++--- .../helixfold/infer_scripts/preprocess.py | 98 ++++++----- 5 files changed, 362 insertions(+), 75 deletions(-) create mode 100644 apps/protein_folding/helixfold3/data/7s69_glycan.sdf create mode 100644 apps/protein_folding/helixfold3/data/demo_7s69_coval.json diff --git a/apps/protein_folding/helixfold3/data/7s69_glycan.sdf b/apps/protein_folding/helixfold3/data/7s69_glycan.sdf new file mode 100644 index 00000000..8ac6a9e0 --- /dev/null +++ b/apps/protein_folding/helixfold3/data/7s69_glycan.sdf @@ -0,0 +1,155 @@ + + OpenBabel03042416223D + + 72 77 0 0 1 0 0 0 0 0999 V2000 + 29.7340 3.2540 76.7430 C 0 0 0 0 0 2 0 0 0 0 0 0 + 29.8160 4.4760 77.6460 C 0 0 1 0 0 3 0 0 0 0 0 0 + 28.5260 5.2840 77.5530 C 0 0 2 0 0 3 0 0 0 0 0 0 + 28.1780 5.5830 76.1020 C 0 0 1 0 0 3 0 0 0 0 0 0 + 28.2350 4.3240 75.2420 C 0 0 1 0 0 3 0 0 0 0 0 0 + 28.1040 4.6170 73.7650 C 0 0 0 0 0 2 0 0 0 0 0 0 + 31.3020 3.8250 79.4830 C 0 0 0 0 0 0 0 0 0 0 0 0 + 31.3910 3.4410 80.9280 C 0 0 0 0 0 1 0 0 0 0 0 0 + 30.0760 4.0880 79.0210 N 0 0 0 0 0 2 0 0 0 0 0 0 + 28.6870 6.5050 78.2670 O 0 0 0 0 0 1 0 0 0 0 0 0 + 26.8490 6.0910 76.0350 O 0 0 0 0 0 0 0 0 0 0 0 0 + 29.4950 3.6650 75.4130 O 0 0 0 0 0 0 0 0 0 0 0 0 + 29.3670 4.5550 73.1150 O 0 0 0 0 0 1 0 0 0 0 0 0 + 32.2950 3.8940 78.7640 O 0 0 0 0 0 0 0 0 0 0 0 0 + 26.7420 7.4140 75.6950 C 0 0 1 0 0 3 0 0 0 0 0 0 + 25.2700 7.7830 75.6110 C 0 0 1 0 0 3 0 0 0 0 0 0 + 25.1290 9.2300 75.1610 C 0 0 2 0 0 3 0 0 0 0 0 0 + 25.9180 10.1440 76.0880 C 0 0 1 0 0 3 0 0 0 0 0 0 + 27.3630 9.6720 76.2210 C 0 0 1 0 0 3 0 0 0 0 0 0 + 28.1310 10.4360 77.2730 C 0 0 0 0 0 2 0 0 0 0 0 0 + 23.8820 5.8170 75.1400 C 0 0 0 0 0 0 0 0 0 0 0 0 + 23.1980 5.0100 74.0810 C 0 0 0 0 0 1 0 0 0 0 0 0 + 24.5530 6.8930 74.7160 N 0 0 0 0 0 2 0 0 0 0 0 0 + 23.7530 9.5950 75.1670 O 0 0 0 0 0 1 0 0 0 0 0 0 + 25.9170 11.4700 75.5730 O 0 0 0 0 0 0 0 0 0 0 0 0 + 27.4050 8.2900 76.6040 O 0 0 0 0 0 0 0 0 0 0 0 0 + 29.5300 10.4030 77.0280 O 0 0 0 0 0 1 0 0 0 0 0 0 + 23.8300 5.5110 76.3290 O 0 0 0 0 0 0 0 0 0 0 0 0 + 25.3940 12.4250 76.4090 C 0 0 1 0 0 3 0 0 0 0 0 0 + 25.9490 13.7680 75.9090 C 0 0 2 0 0 3 0 0 0 0 0 0 + 25.1320 14.9560 76.4900 C 0 0 2 0 0 3 0 0 0 0 0 0 + 23.6130 14.6900 76.6390 C 0 0 1 0 0 3 0 0 0 0 0 0 + 23.3700 13.3000 77.2280 C 0 0 1 0 0 3 0 0 0 0 0 0 + 21.9020 12.9360 77.3500 C 0 0 0 0 0 2 0 0 0 0 0 0 + 25.9010 13.8490 74.4810 O 0 0 0 0 0 1 0 0 0 0 0 0 + 25.3420 16.1410 75.7110 O 0 0 0 0 0 0 0 0 0 0 0 0 + 23.0420 15.6520 77.5170 O 0 0 0 0 0 1 0 0 0 0 0 0 + 23.9910 12.3690 76.3570 O 0 0 0 0 0 0 0 0 0 0 0 0 + 21.3660 12.8480 76.0500 O 0 0 0 0 0 0 0 0 0 0 0 0 + 20.8090 11.6500 75.6780 C 0 0 2 0 0 3 0 0 0 0 0 0 + 20.6800 11.6410 74.1740 C 0 0 2 0 0 3 0 0 0 0 0 0 + 19.5510 12.5850 73.8180 C 0 0 2 0 0 3 0 0 0 0 0 0 + 18.2370 12.0940 74.4540 C 0 0 1 0 0 3 0 0 0 0 0 0 + 18.4030 11.9240 75.9810 C 0 0 1 0 0 3 0 0 0 0 0 0 + 17.2710 11.1260 76.6120 C 0 0 0 0 0 2 0 0 0 0 0 0 + 20.2900 10.3510 73.7080 O 0 0 0 0 0 1 0 0 0 0 0 0 + 19.4280 12.7380 72.4110 O 0 0 0 0 0 0 0 0 0 0 0 0 + 17.2120 13.0460 74.2030 O 0 0 0 0 0 1 0 0 0 0 0 0 + 19.6260 11.2000 76.3010 O 0 0 0 0 0 0 0 0 0 0 0 0 + 16.0670 11.4490 75.9360 O 0 0 0 0 0 1 0 0 0 0 0 0 + 20.2190 13.6280 71.7260 C 0 0 2 0 0 3 0 0 0 0 0 0 + 19.6090 14.0000 70.3810 C 0 0 2 0 0 3 0 0 0 0 0 0 + 19.6360 12.7820 69.4880 C 0 0 2 0 0 3 0 0 0 0 0 0 + 21.0860 12.3100 69.3240 C 0 0 1 0 0 3 0 0 0 0 0 0 + 21.7030 12.0240 70.7120 C 0 0 1 0 0 3 0 0 0 0 0 0 + 23.1940 11.7460 70.6620 C 0 0 0 0 0 2 0 0 0 0 0 0 + 20.4080 14.9810 69.7000 O 0 0 0 0 0 1 0 0 0 0 0 0 + 19.0310 13.0500 68.2340 O 0 0 0 0 0 1 0 0 0 0 0 0 + 21.1060 11.1280 68.5380 O 0 0 0 0 0 1 0 0 0 0 0 0 + 21.5380 13.1700 71.5840 O 0 0 0 0 0 0 0 0 0 0 0 0 + 23.8240 12.5210 71.6820 O 0 0 0 0 0 1 0 0 0 0 0 0 + 26.0070 17.3020 76.0200 C 0 0 2 0 0 3 0 0 0 0 0 0 + 27.0750 17.5250 74.9350 C 0 0 2 0 0 3 0 0 0 0 0 0 + 28.3660 16.8320 75.3290 C 0 0 2 0 0 3 0 0 0 0 0 0 + 28.7820 17.2470 76.7510 C 0 0 1 0 0 3 0 0 0 0 0 0 + 27.6930 16.8120 77.7320 C 0 0 1 0 0 3 0 0 0 0 0 0 + 27.9770 17.2020 79.1710 C 0 0 0 0 0 2 0 0 0 0 0 0 + 27.3990 18.9140 74.8010 O 0 0 0 0 0 1 0 0 0 0 0 0 + 29.4060 17.0990 74.3950 O 0 0 0 0 0 1 0 0 0 0 0 0 + 30.0160 16.6410 77.0930 O 0 0 0 0 0 1 0 0 0 0 0 0 + 26.4610 17.4820 77.3520 O 0 0 0 0 0 0 0 0 0 0 0 0 + 27.3660 18.4620 79.4040 O 0 0 0 0 0 1 0 0 0 0 0 0 + 1 2 1 0 0 0 0 + 1 12 1 0 0 0 0 + 2 3 1 0 0 0 0 + 2 9 1 1 0 0 0 + 3 10 1 1 0 0 0 + 3 4 1 0 0 0 0 + 4 5 1 0 0 0 0 + 4 11 1 1 0 0 0 + 5 6 1 6 0 0 0 + 5 12 1 0 0 0 0 + 6 13 1 0 0 0 0 + 7 14 2 0 0 0 0 + 7 8 1 0 0 0 0 + 7 9 1 0 0 0 0 + 15 16 1 0 0 0 0 + 15 11 1 1 0 0 0 + 15 26 1 0 0 0 0 + 16 23 1 6 0 0 0 + 16 17 1 0 0 0 0 + 17 18 1 0 0 0 0 + 17 24 1 1 0 0 0 + 18 25 1 6 0 0 0 + 18 19 1 0 0 0 0 + 19 20 1 1 0 0 0 + 19 26 1 0 0 0 0 + 20 27 1 0 0 0 0 + 21 22 1 0 0 0 0 + 21 23 1 0 0 0 0 + 21 28 2 0 0 0 0 + 29 38 1 0 0 0 0 + 29 25 1 6 0 0 0 + 29 30 1 0 0 0 0 + 30 35 1 6 0 0 0 + 30 31 1 0 0 0 0 + 31 32 1 0 0 0 0 + 31 36 1 6 0 0 0 + 32 33 1 0 0 0 0 + 32 37 1 1 0 0 0 + 33 38 1 0 0 0 0 + 33 34 1 6 0 0 0 + 34 39 1 0 0 0 0 + 40 49 1 0 0 0 0 + 40 41 1 0 0 0 0 + 40 39 1 1 0 0 0 + 41 46 1 1 0 0 0 + 41 42 1 0 0 0 0 + 42 43 1 0 0 0 0 + 42 47 1 6 0 0 0 + 43 48 1 1 0 0 0 + 43 44 1 0 0 0 0 + 44 49 1 0 0 0 0 + 44 45 1 6 0 0 0 + 45 50 1 0 0 0 0 + 51 47 1 6 0 0 0 + 51 60 1 0 0 0 0 + 51 52 1 0 0 0 0 + 52 53 1 0 0 0 0 + 52 57 1 6 0 0 0 + 53 54 1 0 0 0 0 + 53 58 1 6 0 0 0 + 54 59 1 6 0 0 0 + 54 55 1 0 0 0 0 + 55 56 1 6 0 0 0 + 55 60 1 0 0 0 0 + 56 61 1 0 0 0 0 + 62 71 1 0 0 0 0 + 62 36 1 1 0 0 0 + 62 63 1 0 0 0 0 + 63 68 1 1 0 0 0 + 63 64 1 0 0 0 0 + 64 69 1 6 0 0 0 + 64 65 1 0 0 0 0 + 65 70 1 1 0 0 0 + 65 66 1 0 0 0 0 + 66 67 1 1 0 0 0 + 66 71 1 0 0 0 0 + 67 72 1 0 0 0 0 +M END +$$$$ diff --git a/apps/protein_folding/helixfold3/data/demo_7s69_coval.json b/apps/protein_folding/helixfold3/data/demo_7s69_coval.json new file mode 100644 index 00000000..fc53c9e9 --- /dev/null +++ b/apps/protein_folding/helixfold3/data/demo_7s69_coval.json @@ -0,0 +1,19 @@ +{ + "entities": [ + { + "type": "protein", + "sequence": "DRHHHHHHKLGKMKIVEEPNSFGLNNPFLSQTNKLQPRVQPSPVSGPSHLFRLAGKCFNLVESTYKYELCPFHNVTQHEQTFRWNAYSGILGIWQEWDIENNTFSGMWMREGDSCGNKNRQTKVLLVCGKANKLSSVSEPSTCLYSLTFETPLVCHPHSLLVYPTLSEGLQEKWNEAEQALYDELITEQGHGKILKEIFREAGYLKTTKPDGEGKETQDKPKEFDSLEKCNKGYTELTSEIQRLKKMLNEHGISYVTNGTSRSEGQPAEVNTTFARGEDKVHLRGDTGIRDGQ", + "count": 1 + }, + { + "type": "ligand", + "sdf": "/repo/PaddleHelix/apps/protein_folding/helixfold3/data/7s69_glycan.sdf", + "count": 1 + }, + { + "type": "bond", + "bond": "A,ASN,74,ND2,B,UNK1,1,C,covale,2.3", + "_comment": "'A,74,ND2:B,1:CW,null' from RF2AA" + } + ] +} \ No newline at end of file diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py index c8947643..7a9c2960 100644 --- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py +++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py @@ -1,19 +1,23 @@ """Functions for building the input features (reference ccd features) for the HelixFold model.""" import collections +from dataclasses import dataclass import gzip import os import pickle -from typing import Any, Optional +import re +from typing import Any, List, Literal, Optional, Tuple from absl import logging from immutabledict import immutabledict -from helixfold.common import residue_constants import numpy as np +from openbabel import openbabel +from helixfold.common import residue_constants from helixfold.data.tools import utils + ALLOWED_LIGAND_BONDS_TYPE = { "SING": 1, "DOUB": 2, @@ -22,6 +26,98 @@ "AROM": 12, } +# Define the possible bond types as a Literal +BondType = Literal["covale", "metal", "hydrogen", "ionic", "disulfide", "aromatic"] + + +@dataclass +class AtomPartner: + """ + Represents one partner atom in a covalent bond. + + Attributes: + label_asym_id (str): The asymmetry identifier for the partner atom (i.e., chain ID). + label_comp_id (str): The component identifier for the partner atom (i.e., residue name). + seq_id (str): The sequence identifier for the partner atom (merged label_seq_id and auth_seq_id). + label_atom_id (str): The atom identifier for the partner atom (i.e., atom name). + """ + + label_asym_id: str # Chain ID + label_comp_id: str # Residue name + seq_id: str # Merged sequence ID + label_atom_id: str # Atom name + + +@dataclass +class CovalentBond: + """ + Represents a covalent bond between two atoms in a molecular structure. + + Attributes: + atom_1 (AtomPartner): The first partner atom in the bond. + atom_2 (AtomPartner): The second partner atom in the bond. + bond_type (BondType): The type of the bond. + pdbx_dist_value (float): The distance value as defined in the PDBx/mmCIF format. + """ + + atom_1: AtomPartner + atom_2: AtomPartner + bond_type: BondType + pdbx_dist_value: float + +def parse_covalent_bond_input(input_string: str) -> List[CovalentBond]: + """ + Parses a human-readable string into a list of CovalentBond objects. + + Args: + input_string (str): A string representing covalent bonds, where each bond is + described by two atom partners separated by a comma, + and multiple bonds are separated by semicolons. + Example: "A,GLY,25,CA,A,GLY,25,N,covale,1.32; B,HIS,58,ND1,B,HIS,58,CE1,covale,1.39" + + Returns: + List[CovalentBond]: A list of CovalentBond objects. + """ + covalent_bonds = [] + + # Split the input string by semicolons to separate individual covalent bonds + bond_strings = input_string.split(';') + + for bond_str in bond_strings: + # Split the individual bond string by commas to separate attributes + bond_parts = bond_str.split(',') + + if len(bond_parts) != 10: + raise ValueError(f"Invalid bond format: {bond_str}. Expected 10 fields per bond.") + + # Create AtomPartner instances for the two atoms in the bond + atom_1 = AtomPartner( + label_asym_id=bond_parts[0].strip(), + label_comp_id=bond_parts[1].strip(), + seq_id=bond_parts[2].strip(), + label_atom_id=bond_parts[3].strip() + ) + + atom_2 = AtomPartner( + label_asym_id=bond_parts[4].strip(), + label_comp_id=bond_parts[5].strip(), + seq_id=bond_parts[6].strip(), + label_atom_id=bond_parts[7].strip() + ) + + # Create a CovalentBond instance + covalent_bond = CovalentBond( + atom_1=atom_1, + atom_2=atom_2, + bond_type=bond_parts[8].strip(), + pdbx_dist_value=float(bond_parts[9].strip()) + ) + + # Append the CovalentBond instance to the list + covalent_bonds.append(covalent_bond) + + return covalent_bonds + def load_ccd_dict(ccd_preprocessed_path: str) -> immutabledict[str, Any]: if not os.path.exists(ccd_preprocessed_path): raise FileNotFoundError(f'[CCD] ccd_preprocessed_path: {ccd_preprocessed_path} not exist.') @@ -151,7 +247,7 @@ def make_ccd_conf_features(all_chain_info, ccd_preprocessed_dict, return features -def make_bond_features(covalent_bond, all_chain_info, ccd_preprocessed_dict, +def make_bond_features(covalent_bond: List[CovalentBond], all_chain_info, ccd_preprocessed_dict, extra_feats: Optional[dict]=None): """ all_chain_info: dict, (chain_type_chain_id): ccd_seq (list of ccd), such as: protein_A: ['ALA', 'MET', 'GLY'] @@ -172,24 +268,29 @@ def make_bond_features(covalent_bond, all_chain_info, ccd_preprocessed_dict, _set_chain_id_list = set(chain_id_list) parsed_covalent_bond = [] for _bond in covalent_bond: - left_bond_atomid, right_bond_atomid = _bond['ptnr1_label_atom_id'], _bond['ptnr2_label_atom_id'] - left_bond_name, right_bond_name = _bond['ptnr1_label_comp_id'], _bond['ptnr2_label_comp_id'] - left_bond, right_bond = _bond['ptnr1_label_asym_id'], _bond['ptnr2_label_asym_id'] + # Accessing the AtomPartner attributes for both atoms in the covalent bond + left_bond_atomid, right_bond_atomid = _bond.atom_1.label_atom_id, _bond.atom_2.label_atom_id + left_bond_name, right_bond_name = _bond.atom_1.label_comp_id, _bond.atom_2.label_comp_id + left_bond, right_bond = _bond.atom_1.label_asym_id, _bond.atom_2.label_asym_id - left_bond_idx, right_bond_idx = _bond['ptnr1_label_seq_id'], _bond['ptnr2_label_seq_id'] - auth_left_idx, auth_right_idx = _bond['ptnr1_auth_seq_id'], _bond['ptnr2_auth_seq_id'] + left_bond_idx, right_bond_idx = _bond.atom_1.seq_id, _bond.atom_2.seq_id + auth_left_idx, auth_right_idx = _bond.atom_1.seq_id, _bond.atom_2.seq_id + left_bond_idx = 1 if left_bond_idx == '.' else left_bond_idx right_bond_idx = 1 if right_bond_idx == '.' else right_bond_idx - if _bond['bond_type'] != "covale": + if _bond.bond_type != "covale": + logging.warning(f'Ignore non-covale bond type: {_bond.bond_type}') continue - if _bond['pdbx_dist_value'] > 2.4: + if _bond.pdbx_dist_value > 2.4: # the covalent_bond is cut off by distance=2.4 + logging.warning(f'Ignore bonding with distance > 2.4: {_bond.pdbx_dist_value}') continue ## When some chainID is filtered, bond need to be filtered too. if (left_bond not in _set_chain_id_list) or (right_bond not in _set_chain_id_list): + logging.warning(f'Ignore bonding with left and right out of chain list: ') continue parsed_covalent_bond.append([left_bond, left_bond_name, left_bond_idx, left_bond_atomid, auth_left_idx, diff --git a/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py index fd875159..37b09a99 100644 --- a/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py +++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py @@ -4,7 +4,7 @@ import os from pathlib import Path import pickle -from typing import Optional, Tuple, Any +from typing import List, Mapping, Optional, Tuple import numpy as np from absl import logging @@ -20,7 +20,7 @@ from helixfold.data.tools import utils -from .preprocess import digit2alphabet +from .preprocess import Entity, digit2alphabet POLYMER_STANDARD_RESI_ATOMS = residue_constants.residue_atoms @@ -226,7 +226,7 @@ def get_inference_restype_mask(all_chain_features, ccd_preprocessed_dict, extra_ } -def add_assembly_features(all_chain_features, ccd_preprocessed_dict, no_msa_templ_feats=True): +def add_assembly_features(all_chain_features: Mapping, ccd_preprocessed_dict: Mapping, no_msa_templ_feats:bool=True, covalent_bonds:Optional[List[pipeline_conf_bonds.CovalentBond]]=None): ''' ## NOTE: keep the type and chainID orders. all_chain_features: { @@ -300,7 +300,7 @@ def add_assembly_features(all_chain_features, ccd_preprocessed_dict, no_msa_temp ref_features = pipeline_conf_bonds.make_ccd_conf_features(all_chain_info=new_order_chain_infos, ccd_preprocessed_dict=ccd_preprocessed_dict, extra_feats=extra_feats_infos) - bond_features = pipeline_conf_bonds.make_bond_features(covalent_bond=[], + bond_features = pipeline_conf_bonds.make_bond_features(covalent_bond=covalent_bonds, all_chain_info=new_order_chain_infos, ccd_preprocessed_dict=ccd_preprocessed_dict, extra_feats=extra_feats_infos) @@ -369,7 +369,7 @@ def process_chain_msa(args: tuple[pipeline_multimer_parallel.DataPipeline, str, return chain_id, raw_features, desc, seq -def process_input_json(all_entitys, ccd_preprocessed_path, +def process_input_json(all_entitys: List[Entity], ccd_preprocessed_path, msa_templ_data_pipeline_dict, msa_output_dir, no_msa_templ_feats=False): @@ -378,31 +378,27 @@ def process_input_json(all_entitys, ccd_preprocessed_path, all_chain_features = {} sequence_features = {} num_chains = 0 - for entity_items in all_entitys: + for entity in all_entitys: + if (dtype:=entity.dtype) not in residue_constants.CHAIN_type_order: + continue # dtype(protein, dna, rna, ligand): no_chains, msa_seqs, seqs - dtype = list(entity_items.keys())[0] - items = list(entity_items.values())[0] - entity_count = items['count'] - ccd_seqs = items['seqs'] - msa_seqs = items['msa_seqs'] - extra_mol_infos = items.get('extra_mol_infos', {}) ## dict, 「extra-add, ccd_id」: ccd_features. - - for i in range(entity_count): + + for i in range(entity.count): chain_num_ids = num_chains + i chain_id = digit2alphabet(chain_num_ids) # increase ++ type_chain_id = dtype + '_' + chain_id - if ccd_seqs in sequence_features: - all_chain_features[type_chain_id] = copy.deepcopy(sequence_features[ccd_seqs]) + if entity.seqs in sequence_features: + all_chain_features[type_chain_id] = copy.deepcopy(sequence_features[entity.seqs]) continue - ccd_list = parsers.parse_ccd_fasta(ccd_seqs) + ccd_list = parsers.parse_ccd_fasta(entity.seqs) chain_features = {'msa_templ_feats': {}, 'ccd_seqs': ccd_list, - 'msa_seqs': msa_seqs, - 'extra_feats': extra_mol_infos} + 'msa_seqs': entity.msa_seqs, + 'extra_feats': entity.extra_mol_infos} all_chain_features[type_chain_id] = chain_features - sequence_features[ccd_seqs] = chain_features - num_chains += entity_count + sequence_features[entity.seqs] = chain_features + num_chains += entity.count if not no_msa_templ_feats: ## 1. get all msa_seqs for protein/rna MSA/Template search. Only for protein/rna. @@ -456,8 +452,12 @@ def process_input_json(all_entitys, ccd_preprocessed_path, for _type_chain_id in fasta_seq_to_type_chain_id[fasta_seq]: chain_features['msa_templ_feats'] = copy.deepcopy(seqs_to_msa_features[fasta_seq]) + + # gather all defined covalent bonds + all_covalent_bonds=[bond for entity in all_entitys for bond in entity.msa_seqs if bond.dtype == 'bond'] + assert num_chains == len(all_chain_features.keys()) - all_feats = add_assembly_features(all_chain_features, ccd_preprocessed_dict, no_msa_templ_feats) + all_feats = add_assembly_features(all_chain_features, ccd_preprocessed_dict, no_msa_templ_feats, all_covalent_bonds) np_example, label = all_feats['feats'], all_feats['label'] assert num_chains == len(np.unique(np_example['all_chain_ids'])) diff --git a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py index fa30f96a..6aa66b55 100644 --- a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py +++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py @@ -17,13 +17,17 @@ import tempfile import itertools from absl import logging -from typing import Tuple, Union, Mapping, Literal, Callable, Any +from typing import List, Optional, Tuple, Union, Mapping, Literal, Callable, Any +from dataclasses import dataclass, field import rdkit from rdkit import Chem from rdkit.Chem import AllChem from helixfold.common import residue_constants +from helixfold.data.pipeline_conf_bonds import CovalentBond, parse_covalent_bond_input from helixfold.data.tools import utils +from openbabel import openbabel + ## NOTE: this mapping is only useful for standard dna/rna/protein sequence input. # protein, rna, dna, ligand, non_polymer (ion and non_polymer is also the ligand.) @@ -163,8 +167,19 @@ def _get_supported_formats(self) -> Tuple[str]: formats.append('smiles') return tuple(formats) + + def _load_mol(self, mol2_file:str, ret:Optional[subprocess.CompletedProcess]=None) -> Chem.Mol: + mol = Chem.MolFromMol2File(mol2_file, sanitize=False) + if isinstance(ret, subprocess.CompletedProcess) and '3D coordinate generation failed' in ret.stderr: + mol = generate_ETKDGv3_conformer(mol) + optimal_mol_wo_H = Chem.RemoveAllHs(mol, sanitize=False) + + return optimal_mol_wo_H def _perform_conversion(self, input_type: str, input_value: str, generate_3d: bool=True) -> Chem.Mol: + if input_type == 'mol2' and input_value.endswith('.mol2'): + return self._load_mol(mol2_file=input_value) + with tempfile.NamedTemporaryFile(suffix=".mol2") as temp_file, utils.timing(f'converting {input_type} to mol2: {input_value}'): if input_type == 'smiles': obabel_cmd = f"{self.obabel_bin} -:'{input_value}' -omol2 -O{temp_file.name} {'--gen3d' if generate_3d else ''}" @@ -174,13 +189,8 @@ def _perform_conversion(self, input_type: str, input_value: str, generate_3d: bo obabel_cmd = f"{self.obabel_bin} -i {input_type} {input_value} -omol2 -O{temp_file.name} {'--gen3d' if generate_3d else ''}" logging.debug(f'Launching command: `{obabel_cmd}`') ret = subprocess.run(obabel_cmd, shell=True, capture_output=True, text=True) - mol = Chem.MolFromMol2File(temp_file.name, sanitize=False) - if '3D coordinate generation failed' in ret.stderr: - mol = generate_ETKDGv3_conformer(mol) - optimal_mol_wo_H = Chem.RemoveAllHs(mol, sanitize=False) - - return optimal_mol_wo_H - + return self._load_mol(mol2_file=temp_file.name, ret=ret) + def _convert_to_mol(self, input_type: str, input_value: str, generate_3d: bool=True) -> Chem.Mol: if input_type not in self.supported_formats: raise ValueError(f'Unsupported small molecule input: {input_type}. \nSupported formats: \n{self.supported_formats}\n') @@ -191,7 +201,16 @@ def _convert_to_mol(self, input_type: str, input_value: str, generate_3d: bool=T return self._perform_conversion(input_type, input_value, generate_3d) __call__: Callable[[str, str, bool], Chem.Mol] = _convert_to_mol -def polymer_convert(items): + +@dataclass +class Entity: + dtype: Literal['protein', 'dna', 'rna', 'ligand', 'bond','non_polymer', 'ion'] + seqs: str + msa_seqs: Union[str, List[CovalentBond]] = '' + count: int = 1 + extra_mol_infos: dict[str, Any] = field(default_factory=dict) + +def polymer_convert(items)-> Entity: """ "type": "protein", "sequence": "GPDSMEEVVVPEEPPKLVSALATYVQQERLCTMFLSIANKLLPLKP", @@ -214,18 +233,21 @@ def polymer_convert(items): raise ValueError(f'not support for the {dtype} in polymer_convert') ccd_seqs = ''.join(ccd_seqs) ## (GLY)(ALA)..... - # repeat_ccds, repeat_fasta = [ccd_seqs], [msa_seqs] - return { - dtype: { - 'seqs': ccd_seqs, - 'msa_seqs': msa_seqs, - 'count': count, - 'extra_mol_infos': {} - } - } + return Entity(dtype=dtype, seqs=ccd_seqs, msa_seqs=msa_seqs,count=count) -def ligand_convert(items: Mapping[str, Union[int, str]]): +def covalent_bond_convert(items: Mapping[str, Union[int, str]]) -> Entity: + """ + "type": "bond", + "bond": "A,ASN,74,ND2,B,UNK-," + """ + dtype = items['type'] + bond = parse_covalent_bond_input(items['bond']) + + return Entity(dtype=dtype, seqs='', msa_seqs=bond) + + +def ligand_convert(items: Mapping[str, Union[int, str]]) -> Entity: """ "type": "ligand", "ccd": "ATP", or "smiles": "CCccc(O)ccc", @@ -261,14 +283,8 @@ def ligand_convert(items: Mapping[str, Union[int, str]]): ccd_seqs = ''.join(_ccd_seqs) ## (GLY)(ALA)..... # repeat_ccds, repeat_fasta = [ccd_seqs], [msa_seqs] - return { - 'ligand': { - 'seqs': ccd_seqs, - 'msa_seqs': msa_seqs, - 'count': count, - 'extra_mol_infos': ccd_to_extra_mol_infos, - } - } + return Entity(dtype='ligand', seqs=ccd_seqs,msa_seqs=msa_seqs,count=count,extra_mol_infos=ccd_to_extra_mol_infos) + def entities_rename_and_filter(items): @@ -276,39 +292,34 @@ def entities_rename_and_filter(items): 'ion': 'ligand' } items['type'] = ligand_mapping.get(items['type'], items['type']) - if items['type'] not in ALLOWED_ENTITY_TYPE: + if items['type'] not in ALLOWED_ENTITY_TYPE and items['type'] != 'bond': raise ValueError(f'{items["type"]} is not allowed, will be ignored.') return items -def modify_name_convert(entities: list): +def modify_name_convert(entities: list[Entity]): cur_idx = 0 - for entity_items in entities: + for entity in entities: # dtype(protein, dna, rna, ligand): no_chains, msa_seqs, seqs - dtype = list(entity_items.keys())[0] - items = list(entity_items.values())[0] - entity_count = items['count'] - msa_seqs = items['msa_seqs'] - extra_mol_infos = items.get('extra_mol_infos', {}) ## dict, 「extra-add, ccd_id」: ccd_features. - extra_ccd_ids = list(extra_mol_infos.keys()) ## rename UNK- to UNK-1, 2, 3, 4... - for k in extra_ccd_ids: + extra_mol_infos=copy.deepcopy(entity.extra_mol_infos) + for k in extra_mol_infos: user_name_3 = USER_LIG_IDS_3[cur_idx] - items['seqs'] = items['seqs'].replace('UNK-', user_name_3) - extra_mol_infos[user_name_3] = extra_mol_infos.pop('UNK-') + entity.seqs = entity.seqs.replace('UNK-', user_name_3) + entity.extra_mol_infos[user_name_3] = entity.extra_mol_infos.pop('UNK-') cur_idx += 1 return entities -def online_json_to_entity(json_path, out_dir): +def online_json_to_entity(json_path: str, out_dir: str)-> list[Entity]: obj = read_json(json_path) entities = copy.deepcopy(obj['entities']) os.makedirs(out_dir, exist_ok=True) error_ids = [] - success_entity = [] + success_entity: list[Entity] = [] for idx, items in enumerate(entities): try: items = entities_rename_and_filter(items) @@ -320,6 +331,8 @@ def online_json_to_entity(json_path, out_dir): try: if items['type'] == 'ligand': json_obj = ligand_convert(items) + elif items['type'] == 'bond': + json_obj = covalent_bond_convert(items) else: json_obj = polymer_convert(items) success_entity.append(json_obj) @@ -334,5 +347,4 @@ def online_json_to_entity(json_path, out_dir): if len(error_ids) > 0: raise RuntimeError(f'[Error] Failed to convert {len(error_ids)}/{len(entities)} entities') - success_entity = modify_name_convert(success_entity) - return success_entity \ No newline at end of file + return modify_name_convert(success_entity) \ No newline at end of file From 02a88396273c3ed8de42a9fc8491ad800502c39c Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Fri, 23 Aug 2024 16:54:33 +0800 Subject: [PATCH 35/42] fix: covalent bond --- .../helixfold3/data/demo_7s69_coval.json | 2 +- .../helixfold/data/pipeline_conf_bonds.py | 42 ++++++++++++++++--- .../infer_scripts/feature_processing_aa.py | 2 +- .../helixfold/infer_scripts/preprocess.py | 12 ++++-- 4 files changed, 47 insertions(+), 11 deletions(-) diff --git a/apps/protein_folding/helixfold3/data/demo_7s69_coval.json b/apps/protein_folding/helixfold3/data/demo_7s69_coval.json index fc53c9e9..a3890224 100644 --- a/apps/protein_folding/helixfold3/data/demo_7s69_coval.json +++ b/apps/protein_folding/helixfold3/data/demo_7s69_coval.json @@ -12,7 +12,7 @@ }, { "type": "bond", - "bond": "A,ASN,74,ND2,B,UNK1,1,C,covale,2.3", + "bond": "A,ASN,74,ND2,B,UNK-1,1,C16,covale,2.3", "_comment": "'A,74,ND2:B,1:CW,null' from RF2AA" } ] diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py index 7a9c2960..9e5b7d2e 100644 --- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py +++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py @@ -115,6 +115,7 @@ def parse_covalent_bond_input(input_string: str) -> List[CovalentBond]: # Append the CovalentBond instance to the list covalent_bonds.append(covalent_bond) + logging.info(f"Added {len(covalent_bonds)} bonds: {covalent_bonds}") return covalent_bonds @@ -306,6 +307,8 @@ def make_bond_features(covalent_bond: List[CovalentBond], all_chain_info, ccd_pr chainId_to_type = {} ligand_bond_type = [] # (i, j, bond_type), represent the bond between token i and token j bond_index = [] # (i,j) represent the bond between token i and token j + + ccd_id2atom_ids: dict[str, list]={} ccd_standard_set = residue_constants.STANDARD_LIST for chain_type_id, ccd_seq in all_chain_info.items(): chain_type, chain_id = chain_type_id.rsplit('_', 1) @@ -326,6 +329,8 @@ def make_bond_features(covalent_bond: List[CovalentBond], all_chain_info, ccd_pr else: _ccd_feats = ccd_preprocessed_dict[ccd_id] atom_ids = _ccd_feats['atom_ids'] + + ccd_id2atom_ids[ccd_id] = atom_ids assert len(atom_ids) > 0, f'TODO filter - Got CCD <{ccd_id}>: 0 atom nums.' all_token_nums += len(atom_ids) @@ -372,16 +377,43 @@ def make_bond_features(covalent_bond: List[CovalentBond], all_chain_info, ccd_pr ptnr2_label_seq_id = ptnr2_auth_seq_id try: - assert ptnr1_label_asym_id in chainId_to_ccd_list and ptnr2_label_asym_id in chainId_to_ccd_list + if not (ptnr1_label_asym_id in chainId_to_ccd_list and ptnr2_label_asym_id in chainId_to_ccd_list): + raise ValueError(f"Invalid chain id:\n{ptnr1_label_asym_id}/{ptnr2_label_asym_id}\n{chainId_to_ccd_list}") ptnr1_ccd_id = chainId_to_ccd_list[ptnr1_label_asym_id][int(ptnr1_label_seq_id) - 1] ptnr2_ccd_id = chainId_to_ccd_list[ptnr2_label_asym_id][int(ptnr2_label_seq_id) - 1] - assert ptnr1_ccd_id == ptnr1_label_comp_id and ptnr2_ccd_id == ptnr2_label_comp_id - except: + + + # renamed ligand residues + + + if ptnr1_ccd_id != ptnr1_label_comp_id: + logging.warning(f"Find ligand residue: {ptnr1_label_comp_id} -> {ptnr1_ccd_id}") + #ptnr1_label_comp_id = ptnr1_ccd_id + + if ptnr2_ccd_id != ptnr2_label_comp_id: + logging.warning(f"Find ligand residue: {ptnr2_label_comp_id} -> {ptnr2_ccd_id}") + #ptnr2_label_comp_id = ptnr2_ccd_id + + except ValueError as e: ## some convalent-bond from mmcif is misslead, pass it. + logging.warning(f'Error occurred during covalent bond processing: {e}') continue + + - ptnr1_ccd_atoms_list = ccd_preprocessed_dict[ptnr1_ccd_id]['atom_ids'] - ptnr2_ccd_atoms_list = ccd_preprocessed_dict[ptnr2_ccd_id]['atom_ids'] + if ptnr1_ccd_id in ccd_preprocessed_dict: + ptnr1_ccd_atoms_list = ccd_preprocessed_dict[ptnr1_ccd_id]['atom_ids'] + else: + ptnr1_ccd_atoms_list = ccd_id2atom_ids[ptnr1_ccd_id] + + logging.debug(f'{ptnr1_ccd_id=}: {ptnr1_ccd_atoms_list=}') + + if ptnr2_ccd_id in ccd_preprocessed_dict: + ptnr2_ccd_atoms_list = ccd_preprocessed_dict[ptnr2_ccd_id]['atom_ids'] + else: + ptnr2_ccd_atoms_list = ccd_id2atom_ids[ptnr2_ccd_id] + + logging.debug(f'{ptnr2_ccd_id=}: {ptnr2_ccd_atoms_list=}') if ptnr1_ccd_id in ccd_standard_set: ## if ccd_id is in the standard residue in HF3 (table 13), we didn't have to map to atom-leval index diff --git a/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py index 37b09a99..31272c32 100644 --- a/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py +++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py @@ -454,7 +454,7 @@ def process_input_json(all_entitys: List[Entity], ccd_preprocessed_path, # gather all defined covalent bonds - all_covalent_bonds=[bond for entity in all_entitys for bond in entity.msa_seqs if bond.dtype == 'bond'] + all_covalent_bonds=[bond for entity in all_entitys for bond in entity.msa_seqs if entity.dtype == 'bond'] assert num_chains == len(all_chain_features.keys()) all_feats = add_assembly_features(all_chain_features, ccd_preprocessed_dict, no_msa_templ_feats, all_covalent_bonds) diff --git a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py index 6aa66b55..2fffbb96 100644 --- a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py +++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py @@ -180,16 +180,20 @@ def _perform_conversion(self, input_type: str, input_value: str, generate_3d: bo if input_type == 'mol2' and input_value.endswith('.mol2'): return self._load_mol(mol2_file=input_value) - with tempfile.NamedTemporaryFile(suffix=".mol2") as temp_file, utils.timing(f'converting {input_type} to mol2: {input_value}'): + save_path=os.path.join('ligands',f'{os.path.basename(input_value)[:-(len(input_type)+1)] if input_type != "smiles" else "UNK"}.mol2') + + os.makedirs(os.path.dirname(save_path), exist_ok=True) + + with utils.timing(f'converting {input_type} to mol2: {input_value}'): if input_type == 'smiles': - obabel_cmd = f"{self.obabel_bin} -:'{input_value}' -omol2 -O{temp_file.name} {'--gen3d' if generate_3d else ''}" + obabel_cmd = f"{self.obabel_bin} -:'{input_value}' -omol2 -O{save_path} {'--gen3d' if generate_3d else ''}" if len(input_value)>60: logging.warning(f'This takes a while ...') else: - obabel_cmd = f"{self.obabel_bin} -i {input_type} {input_value} -omol2 -O{temp_file.name} {'--gen3d' if generate_3d else ''}" + obabel_cmd = f"{self.obabel_bin} -i {input_type} {input_value} -omol2 -O{save_path} {'--gen3d' if generate_3d else ''}" logging.debug(f'Launching command: `{obabel_cmd}`') ret = subprocess.run(obabel_cmd, shell=True, capture_output=True, text=True) - return self._load_mol(mol2_file=temp_file.name, ret=ret) + return self._load_mol(mol2_file=save_path, ret=ret) def _convert_to_mol(self, input_type: str, input_value: str, generate_3d: bool=True) -> Chem.Mol: if input_type not in self.supported_formats: From 2ed4486fd5c9041253cb4be9dd3db11b3deff85c Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Fri, 23 Aug 2024 22:54:03 +0800 Subject: [PATCH 36/42] doc: covalently --- apps/protein_folding/helixfold3/README.md | 43 +++++++++++++++++++ .../helixfold3/data/demo_7s69_coval.json | 3 +- .../helixfold3/data/demo_p450_heme_coval.json | 25 +++++++++++ .../helixfold3/helixfold/inference.py | 20 +++++++++ .../protein_folding/helixfold3/pyproject.toml | 1 + 5 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 apps/protein_folding/helixfold3/data/demo_p450_heme_coval.json diff --git a/apps/protein_folding/helixfold3/README.md b/apps/protein_folding/helixfold3/README.md index 639a46be..0eaa1512 100644 --- a/apps/protein_folding/helixfold3/README.md +++ b/apps/protein_folding/helixfold3/README.md @@ -126,6 +126,49 @@ A example of input data is as follows: } ``` +Another example of **covalently modified** input: +```json +{ + "entities": [ + { + "type": "protein", + "sequence": "MDALYKSTVAKFNEVIQLDCSTEFFSIALSSIAGILLLLLLFRSKRHSSLKLPPGKLGIPFIGESFIFLRALRSNSLEQFFDERVKKFGLVFKTSLIGHPTVVLCGPAGNRLILSNEEKLVQMSWPAQFMKLMGENSVATRRGEDHIVMRSALAGFFGPGALQSYIGKMNTEIQSHINEKWKGKDEVNVLPLVRELVFNISAILFFNIYDKQEQDRLHKLLETILVGSFALPIDLPGFGFHRALQGRAKLNKIMLSLIKKRKEDLQSGSATATQDLLSVLLTFRDDKGTPLTNDEILDNFSSLLHASYDTTTSPMALIFKLLSSNPECYQKVVQEQLEILSNKEEGEEITWKDLKAMKYTWQVAQETLRMFPPVFGTFRKAITDIQYDGYTIPKGWKLLWTTYSTHPKDLYFNEPEKFMPSRFDQEGKHVAPYTFLPFGGGQRSCVGWEFSKMEILLFVHHFVKTFSSYTPVDPDEKISGDPLPPLPSKGFSIKLFPRP", + "count": 1 + }, + { + "type": "ligand", + "ccd": "HEM", + "count": 1 + }, + { + "type": "ligand", + "smiles": "CC1=C2CC[C@@]3(CCCC(=C)[C@H]3C[C@@H](C2(C)C)CC1)C", + "count": 1 + }, + { + "type": "bond", + "bond": "A,CYS,445,SG,B,HEM,1,FE,covale,2.3", + "_comment": ",,,,,,,,,", + "_also_comment": "For ccd input, use CCD key as residue name; for smiles and file input, use `UNK-` where index is the chain order you input" + } + ] +} +``` + +For seaking all atom ids in CCD database: +```shell +helixfold_show_ccd +ccd_id=HEM +``` + +This command outputs like: +```text +# output: +[2024-08-23 22:44:36,324][absl][INFO] - Started Loading CCD dataset from /mnt/db/ccd/ccd_preprocessed_etkdg.pkl.gz +[2024-08-23 22:44:43,236][absl][INFO] - Finished Loading CCD dataset from /mnt/db/ccd/ccd_preprocessed_etkdg.pkl.gz in 6.912 seconds +[2024-08-23 22:44:43,237][absl][INFO] - CCD dataset contains 43488 entries. +[2024-08-23 22:44:43,237][absl][INFO] - Atoms in HEM: ['CHA', 'CHB', 'CHC', 'CHD', 'C1A', 'C2A', 'C3A', 'C4A', 'CMA', 'CAA', 'CBA', 'CGA', 'O1A', 'O2A', 'C1B', 'C2B', 'C3B', 'C4B', 'CMB', 'CAB', 'CBB', 'C1C', 'C2C', 'C3C', 'C4C', 'CMC', 'CAC', 'CBC', 'C1D', 'C2D', 'C3D', 'C4D', 'CMD', 'CAD', 'CBD', 'CGD', 'O1D', 'O2D', 'NA', 'NB', 'NC', 'ND', 'FE'] +``` + #### Running HelixFold for Inference To run inference on a sequence or multiple sequences using HelixFold3's pretrained parameters, run e.g.: diff --git a/apps/protein_folding/helixfold3/data/demo_7s69_coval.json b/apps/protein_folding/helixfold3/data/demo_7s69_coval.json index a3890224..7b2550ff 100644 --- a/apps/protein_folding/helixfold3/data/demo_7s69_coval.json +++ b/apps/protein_folding/helixfold3/data/demo_7s69_coval.json @@ -13,7 +13,8 @@ { "type": "bond", "bond": "A,ASN,74,ND2,B,UNK-1,1,C16,covale,2.3", - "_comment": "'A,74,ND2:B,1:CW,null' from RF2AA" + "_comment": "'A,74,ND2:B,1:CW,null' from RF2AA.", + "_also_comment": "For ccd input, use CCD key as residue name; for smiles and file input, use `UNK-` where index is the chain order you input" } ] } \ No newline at end of file diff --git a/apps/protein_folding/helixfold3/data/demo_p450_heme_coval.json b/apps/protein_folding/helixfold3/data/demo_p450_heme_coval.json new file mode 100644 index 00000000..6761a4bc --- /dev/null +++ b/apps/protein_folding/helixfold3/data/demo_p450_heme_coval.json @@ -0,0 +1,25 @@ +{ + "entities": [ + { + "type": "protein", + "sequence": "MDALYKSTVAKFNEVIQLDCSTEFFSIALSSIAGILLLLLLFRSKRHSSLKLPPGKLGIPFIGESFIFLRALRSNSLEQFFDERVKKFGLVFKTSLIGHPTVVLCGPAGNRLILSNEEKLVQMSWPAQFMKLMGENSVATRRGEDHIVMRSALAGFFGPGALQSYIGKMNTEIQSHINEKWKGKDEVNVLPLVRELVFNISAILFFNIYDKQEQDRLHKLLETILVGSFALPIDLPGFGFHRALQGRAKLNKIMLSLIKKRKEDLQSGSATATQDLLSVLLTFRDDKGTPLTNDEILDNFSSLLHASYDTTTSPMALIFKLLSSNPECYQKVVQEQLEILSNKEEGEEITWKDLKAMKYTWQVAQETLRMFPPVFGTFRKAITDIQYDGYTIPKGWKLLWTTYSTHPKDLYFNEPEKFMPSRFDQEGKHVAPYTFLPFGGGQRSCVGWEFSKMEILLFVHHFVKTFSSYTPVDPDEKISGDPLPPLPSKGFSIKLFPRP", + "count": 1 + }, + { + "type": "ligand", + "ccd": "HEM", + "count": 1 + }, + { + "type": "ligand", + "smiles": "CC1=C2CC[C@@]3(CCCC(=C)[C@H]3C[C@@H](C2(C)C)CC1)C", + "count": 1 + }, + { + "type": "bond", + "bond": "A,CYS,445,SG,B,HEM,1,FE,covale,2.3", + "_comment": ",,,,,,,,,", + "_also_comment": "For ccd input, use CCD key as residue name; for smiles and file input, use `UNK-` where index is the chain order you input" + } + ] +} \ No newline at end of file diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py index e18029a5..8a862279 100644 --- a/apps/protein_folding/helixfold3/helixfold/inference.py +++ b/apps/protein_folding/helixfold3/helixfold/inference.py @@ -31,6 +31,7 @@ import hydra from helixfold.common import all_atom_pdb_save +from helixfold.data.pipeline_conf_bonds import load_ccd_dict from helixfold.model import config, utils from helixfold.data import pipeline_parallel as pipeline from helixfold.data import pipeline_multimer_parallel as pipeline_multimer @@ -565,5 +566,24 @@ def main(cfg: DictConfig): ranking_all_predictions(all_pred_path) print(f'============ Inference finished ! ============') + +@hydra.main(version_base=None, config_path=os.path.join(script_path,'config',),config_name='helixfold') +def show_atom_id_ccd(cfg: DictConfig): + ccd_preprocessed_path = cfg.db.ccd_preprocessed + + + ccd_id=cfg.ccd_id + if len(ccd_id) <= 3 and ccd_id in (ccd_dict:=load_ccd_dict(ccd_preprocessed_path)): + logging.info(f'Atoms in {ccd_id}: {ccd_dict[ccd_id]["atom_ids"]}') + return + + + + + + + + + if __name__ == '__main__': main() diff --git a/apps/protein_folding/helixfold3/pyproject.toml b/apps/protein_folding/helixfold3/pyproject.toml index 8c6dc9f7..33956227 100644 --- a/apps/protein_folding/helixfold3/pyproject.toml +++ b/apps/protein_folding/helixfold3/pyproject.toml @@ -47,3 +47,4 @@ joblib = "1.4.2" [tool.poetry.scripts] helixfold = 'helixfold.inference:main' +helixfold_show_ccd = 'helixfold.inference:show_atom_id_ccd' From b718ced5d4f3137fae795647b9f349853fda400a Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Sat, 24 Aug 2024 11:18:43 +0800 Subject: [PATCH 37/42] Update README.md --- apps/protein_folding/helixfold3/README.md | 122 ++++++++++++++++++++-- 1 file changed, 112 insertions(+), 10 deletions(-) diff --git a/apps/protein_folding/helixfold3/README.md b/apps/protein_folding/helixfold3/README.md index 0eaa1512..8c810ccd 100644 --- a/apps/protein_folding/helixfold3/README.md +++ b/apps/protein_folding/helixfold3/README.md @@ -72,14 +72,22 @@ Note: If you have a different version of python3 and cuda, please refer to [here #### Install Maxit The conversion between `.cif` and `.pdb` relies on [Maxit](https://sw-tools.rcsb.org/apps/MAXIT/index.html). Download Maxit source code from https://sw-tools.rcsb.org/apps/MAXIT/maxit-v11.100-prod-src.tar.gz. Untar and follow -its `README` to complete installation. +its `README` to complete installation. If you encouter error like your GCC version not support (9.4.0, for example), editing `etc/platform.sh` and reruning compilation again would make sense. See below: + +```bash +# Check if it is a Linux platform + Linux) +# Check if it is GCC version 4.x + gcc_ver=`gcc --version | grep -e " 4\."` # edit `4\.` to `9\.` + if [[ -z $gcc_ver ]] +``` ### Usage In order to run HelixFold3, the genetic databases and model parameters are required. The parameters of HelixFold3 can be downloaded [here](https://paddlehelix.bd.bcebos.com/HelixFold3/params/HelixFold3-params-240814.zip), -please place the downloaded checkpoint in ```./init_models/ ```directory. +please place the downloaded checkpoint path in `weight_path` of `helixfold/config/helixfold.yaml` configuration file before install HF3 as a python module. The script `scripts/download_all_data.sh` can be used to download and set up all genetic databases with the following configs: @@ -109,6 +117,7 @@ HelixFold3 supports input ligand as SMILES, CCD id or small molecule files, plea for more details about SMILES input. Flexible input from small molecule is now supported. See `obabel -L formats |grep -v 'Write-only'` A example of input data is as follows: + ```json { "entities": [ @@ -127,6 +136,7 @@ A example of input data is as follows: ``` Another example of **covalently modified** input: + ```json { "entities": [ @@ -149,18 +159,21 @@ Another example of **covalently modified** input: "type": "bond", "bond": "A,CYS,445,SG,B,HEM,1,FE,covale,2.3", "_comment": ",,,,,,,,,", - "_also_comment": "For ccd input, use CCD key as residue name; for smiles and file input, use `UNK-` where index is the chain order you input" + "_another_comment": "use semicolon to separate multiple bonds", + "_also_comment": "For ccd input, use CCD key as residue name; for smiles and file input, use `UNK-` where index is the chain order you input. eg. `UNK-1` for the first ligand chain(or the count #1), `UNK-2` the second(or the count #2)." } ] } ``` For seaking all atom ids in CCD database: + ```shell helixfold_show_ccd +ccd_id=HEM ``` This command outputs like: + ```text # output: [2024-08-23 22:44:36,324][absl][INFO] - Started Loading CCD dataset from /mnt/db/ccd/ccd_preprocessed_etkdg.pkl.gz @@ -169,10 +182,97 @@ This command outputs like: [2024-08-23 22:44:43,237][absl][INFO] - Atoms in HEM: ['CHA', 'CHB', 'CHC', 'CHD', 'C1A', 'C2A', 'C3A', 'C4A', 'CMA', 'CAA', 'CBA', 'CGA', 'O1A', 'O2A', 'C1B', 'C2B', 'C3B', 'C4B', 'CMB', 'CAB', 'CBB', 'C1C', 'C2C', 'C3C', 'C4C', 'CMC', 'CAC', 'CBC', 'C1D', 'C2D', 'C3D', 'C4D', 'CMD', 'CAD', 'CBD', 'CGD', 'O1D', 'O2D', 'NA', 'NB', 'NC', 'ND', 'FE'] ``` +For seaking all atom ids in a given `sdf`/`mol2`, the atom list follows the same order in its file. + +HF3 parsed: + +```text +['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'N1', 'O1', 'O2', 'O3', 'O4', 'O5', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'N2', 'O6', 'O7', 'O8', 'O9', 'O10', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22', 'O11', 'O12', 'O13', 'O14', 'O15', 'C23', 'C24', 'C25', 'C26', 'C27', 'C28', 'O16', 'O17', 'O18', 'O19', 'O20', 'C29', 'C30', 'C31', 'C32', 'C33', 'C34', 'O21', 'O22', 'O23', 'O24', 'O25', 'C35', 'C36', 'C37', 'C38', 'C39', 'C40', 'O26', 'O27', 'O28', 'O29', 'O30'] +``` + +while in `SDF`: + +```text + 29.7340 3.2540 76.7430 C 0 0 0 0 0 2 0 0 0 0 0 0 + 29.8160 4.4760 77.6460 C 0 0 1 0 0 3 0 0 0 0 0 0 + 28.5260 5.2840 77.5530 C 0 0 2 0 0 3 0 0 0 0 0 0 + 28.1780 5.5830 76.1020 C 0 0 1 0 0 3 0 0 0 0 0 0 + 28.2350 4.3240 75.2420 C 0 0 1 0 0 3 0 0 0 0 0 0 + 28.1040 4.6170 73.7650 C 0 0 0 0 0 2 0 0 0 0 0 0 + 31.3020 3.8250 79.4830 C 0 0 0 0 0 0 0 0 0 0 0 0 + 31.3910 3.4410 80.9280 C 0 0 0 0 0 1 0 0 0 0 0 0 + 30.0760 4.0880 79.0210 N 0 0 0 0 0 2 0 0 0 0 0 0 + 28.6870 6.5050 78.2670 O 0 0 0 0 0 1 0 0 0 0 0 0 + 26.8490 6.0910 76.0350 O 0 0 0 0 0 0 0 0 0 0 0 0 + 29.4950 3.6650 75.4130 O 0 0 0 0 0 0 0 0 0 0 0 0 + 29.3670 4.5550 73.1150 O 0 0 0 0 0 1 0 0 0 0 0 0 + 32.2950 3.8940 78.7640 O 0 0 0 0 0 0 0 0 0 0 0 0 + 26.7420 7.4140 75.6950 C 0 0 1 0 0 3 0 0 0 0 0 0 + 25.2700 7.7830 75.6110 C 0 0 1 0 0 3 0 0 0 0 0 0 + 25.1290 9.2300 75.1610 C 0 0 2 0 0 3 0 0 0 0 0 0 + 25.9180 10.1440 76.0880 C 0 0 1 0 0 3 0 0 0 0 0 0 + 27.3630 9.6720 76.2210 C 0 0 1 0 0 3 0 0 0 0 0 0 + 28.1310 10.4360 77.2730 C 0 0 0 0 0 2 0 0 0 0 0 0 + 23.8820 5.8170 75.1400 C 0 0 0 0 0 0 0 0 0 0 0 0 + 23.1980 5.0100 74.0810 C 0 0 0 0 0 1 0 0 0 0 0 0 + 24.5530 6.8930 74.7160 N 0 0 0 0 0 2 0 0 0 0 0 0 + 23.7530 9.5950 75.1670 O 0 0 0 0 0 1 0 0 0 0 0 0 + 25.9170 11.4700 75.5730 O 0 0 0 0 0 0 0 0 0 0 0 0 + 27.4050 8.2900 76.6040 O 0 0 0 0 0 0 0 0 0 0 0 0 + 29.5300 10.4030 77.0280 O 0 0 0 0 0 1 0 0 0 0 0 0 + 23.8300 5.5110 76.3290 O 0 0 0 0 0 0 0 0 0 0 0 0 + 25.3940 12.4250 76.4090 C 0 0 1 0 0 3 0 0 0 0 0 0 + 25.9490 13.7680 75.9090 C 0 0 2 0 0 3 0 0 0 0 0 0 + 25.1320 14.9560 76.4900 C 0 0 2 0 0 3 0 0 0 0 0 0 + 23.6130 14.6900 76.6390 C 0 0 1 0 0 3 0 0 0 0 0 0 + 23.3700 13.3000 77.2280 C 0 0 1 0 0 3 0 0 0 0 0 0 + 21.9020 12.9360 77.3500 C 0 0 0 0 0 2 0 0 0 0 0 0 + 25.9010 13.8490 74.4810 O 0 0 0 0 0 1 0 0 0 0 0 0 + 25.3420 16.1410 75.7110 O 0 0 0 0 0 0 0 0 0 0 0 0 + 23.0420 15.6520 77.5170 O 0 0 0 0 0 1 0 0 0 0 0 0 + 23.9910 12.3690 76.3570 O 0 0 0 0 0 0 0 0 0 0 0 0 + 21.3660 12.8480 76.0500 O 0 0 0 0 0 0 0 0 0 0 0 0 + 20.8090 11.6500 75.6780 C 0 0 2 0 0 3 0 0 0 0 0 0 + 20.6800 11.6410 74.1740 C 0 0 2 0 0 3 0 0 0 0 0 0 + 19.5510 12.5850 73.8180 C 0 0 2 0 0 3 0 0 0 0 0 0 + 18.2370 12.0940 74.4540 C 0 0 1 0 0 3 0 0 0 0 0 0 + 18.4030 11.9240 75.9810 C 0 0 1 0 0 3 0 0 0 0 0 0 + 17.2710 11.1260 76.6120 C 0 0 0 0 0 2 0 0 0 0 0 0 + 20.2900 10.3510 73.7080 O 0 0 0 0 0 1 0 0 0 0 0 0 + 19.4280 12.7380 72.4110 O 0 0 0 0 0 0 0 0 0 0 0 0 + 17.2120 13.0460 74.2030 O 0 0 0 0 0 1 0 0 0 0 0 0 + 19.6260 11.2000 76.3010 O 0 0 0 0 0 0 0 0 0 0 0 0 + 16.0670 11.4490 75.9360 O 0 0 0 0 0 1 0 0 0 0 0 0 + 20.2190 13.6280 71.7260 C 0 0 2 0 0 3 0 0 0 0 0 0 + 19.6090 14.0000 70.3810 C 0 0 2 0 0 3 0 0 0 0 0 0 + 19.6360 12.7820 69.4880 C 0 0 2 0 0 3 0 0 0 0 0 0 + 21.0860 12.3100 69.3240 C 0 0 1 0 0 3 0 0 0 0 0 0 + 21.7030 12.0240 70.7120 C 0 0 1 0 0 3 0 0 0 0 0 0 + 23.1940 11.7460 70.6620 C 0 0 0 0 0 2 0 0 0 0 0 0 + 20.4080 14.9810 69.7000 O 0 0 0 0 0 1 0 0 0 0 0 0 + 19.0310 13.0500 68.2340 O 0 0 0 0 0 1 0 0 0 0 0 0 + 21.1060 11.1280 68.5380 O 0 0 0 0 0 1 0 0 0 0 0 0 + 21.5380 13.1700 71.5840 O 0 0 0 0 0 0 0 0 0 0 0 0 + 23.8240 12.5210 71.6820 O 0 0 0 0 0 1 0 0 0 0 0 0 + 26.0070 17.3020 76.0200 C 0 0 2 0 0 3 0 0 0 0 0 0 + 27.0750 17.5250 74.9350 C 0 0 2 0 0 3 0 0 0 0 0 0 + 28.3660 16.8320 75.3290 C 0 0 2 0 0 3 0 0 0 0 0 0 + 28.7820 17.2470 76.7510 C 0 0 1 0 0 3 0 0 0 0 0 0 + 27.6930 16.8120 77.7320 C 0 0 1 0 0 3 0 0 0 0 0 0 + 27.9770 17.2020 79.1710 C 0 0 0 0 0 2 0 0 0 0 0 0 + 27.3990 18.9140 74.8010 O 0 0 0 0 0 1 0 0 0 0 0 0 + 29.4060 17.0990 74.3950 O 0 0 0 0 0 1 0 0 0 0 0 0 + 30.0160 16.6410 77.0930 O 0 0 0 0 0 1 0 0 0 0 0 0 + 26.4610 17.4820 77.3520 O 0 0 0 0 0 0 0 0 0 0 0 0 + 27.3660 18.4620 79.4040 O 0 0 0 0 0 1 0 0 0 0 0 0 +``` + #### Running HelixFold for Inference + To run inference on a sequence or multiple sequences using HelixFold3's pretrained parameters, run e.g.: ##### Run from default config + ```shell LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH \ helixfold \ @@ -182,6 +282,7 @@ helixfold \ ``` ##### Run with customized configuration dir and file(`./myfold.yaml`, for example): + ```shell LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH \ helixfold --config-dir=. --config-name=myfold \ @@ -191,14 +292,15 @@ helixfold --config-dir=. --config-name=myfold \ ``` ##### Run with additional configuration term + ```shell LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH \ helixfold \ input=./data/demo_6zcy.json \ output=. \ - CONFIG_DIFFS.model.heads.confidence_head.weight=0.01 \ - CONFIG_DIFFS.model.global_config.subbatch_size=192 \ - CONFIG_DIFFS.model.num_recycle=10 + CONFIG_DIFFS.preset=allatom_demo \ + +CONFIG_DIFFS.model.global_config.subbatch_size=192 \ + +CONFIG_DIFFS.model.num_recycle=10 ``` The descriptions of the above script are as follows: @@ -216,7 +318,7 @@ The outputs will be in a subfolder of `output_dir`, including the computed MSAs, ranked structures, and evaluation metrics. For a task of inferring twice with diffusion batch size 3, assume your input JSON is named `demo_data.json`, the `output_dir` directory will have the following structure: -``` +```text / └── demo_data/ ├── demo_data-pred-1-1/ @@ -240,9 +342,10 @@ assume your input JSON is named `demo_data.json`, the `output_dir` directory wil └── ... ``` + The contents of each output file are as follows: * `final_features.pkl` – A `pickle` file containing the input feature NumPy arrays - used by the models to predict the structures. + used by the models to predict the structures. If you need to re-run a inference without re-building the MSAs, delete this file. * `msas/` - A directory containing the files describing the various genetic tool hits that were used to construct the input MSA. * `demo_data-pred-X-Y` - Prediction results of `demo_data.json` in X-th inference and Y-thdiffusion batch, @@ -256,8 +359,7 @@ We suggest a single GPU for inference has at least 32G available memory. The max single V100-32G with precision `fp32` is up to 1000. Inferring longer tokens or entities with larger atom numbers per token than normal protein residues like nucleic acids may cost more GPU memory. -For samples with larger tokens, you can reduce `model.global_config.subbatch_size` in `CONFIG_DIFFS` in `helixfold/model/config.py` to save more GPU memory but suffer from slower inference. `model.global_config.subbatch_size` is set as `96` by default. You can also -reduce the number of additional recycles by changing `model.num_recycle` in the same place. +For samples with larger tokens, you can override `model.global_config.subbatch_size` in `CONFIG_ALLATOM` by using `+CONFIG_DIFFS.model.global_config.subbatch_size=X` on command runs, where `X` is a smaller number than `96`, to save more GPU memory although this will cause a slower inference. Additionally, you can reduce the number of additional recycles by setting `+CONFIG_DIFFS.model.num_recycle=Y`, where `Y` is a smaller number than `3`. We are keen on support longer token inference, it will come in soon. From b010c13a490f6b21677c28d4830edf2ad4417a38 Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Sat, 24 Aug 2024 13:00:54 +0800 Subject: [PATCH 38/42] feat: disulfide bonding --- .../helixfold3/data/demo_disulf.json | 14 ++++++++++++++ .../helixfold/data/pipeline_conf_bonds.py | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 apps/protein_folding/helixfold3/data/demo_disulf.json diff --git a/apps/protein_folding/helixfold3/data/demo_disulf.json b/apps/protein_folding/helixfold3/data/demo_disulf.json new file mode 100644 index 00000000..780b9d7d --- /dev/null +++ b/apps/protein_folding/helixfold3/data/demo_disulf.json @@ -0,0 +1,14 @@ +{ + "entities": [ + { + "type": "protein", + "sequence": "MASVKSSSSSSSSSFISLLLLILLVIVLQSQVIECQPQQSCTASLTGLNVCAPFLVPGSPTASTECCNAVQSINHDCMCNTMRIAAQIPAQCNLPPLSCSAN", + "count": 1 + }, + { + "type": "bond", + "bond": "A,CYS,41,SG,A,CYS,77,SG,disulf,2.2;A,CYS,51,SG,A,CYS,66,SG,disulf,2.2;A,CYS,67,SG,A,CYS,92,SG,disulf,2.2;A,CYS,79,SG,A,CYS,99,SG,disulf,2.2", + "_case_from": "https://www.uniprot.org/uniprotkb/Q43495/entry#ptm_processing" + } + ] +} \ No newline at end of file diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py index 9e5b7d2e..a7d2708e 100644 --- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py +++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py @@ -115,7 +115,7 @@ def parse_covalent_bond_input(input_string: str) -> List[CovalentBond]: # Append the CovalentBond instance to the list covalent_bonds.append(covalent_bond) - logging.info(f"Added {len(covalent_bonds)} bonds: {covalent_bonds}") + logging.info(f"Added {len(covalent_bonds)} bonds: {covalent_bonds}") return covalent_bonds From 22bbe7ade88f6547571a4f8e09df849e727ce6a4 Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Sat, 24 Aug 2024 13:35:30 +0800 Subject: [PATCH 39/42] feat: disulfide bonding --- .../data/demo_disulf_homodimer.json | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 apps/protein_folding/helixfold3/data/demo_disulf_homodimer.json diff --git a/apps/protein_folding/helixfold3/data/demo_disulf_homodimer.json b/apps/protein_folding/helixfold3/data/demo_disulf_homodimer.json new file mode 100644 index 00000000..fd56fcd4 --- /dev/null +++ b/apps/protein_folding/helixfold3/data/demo_disulf_homodimer.json @@ -0,0 +1,33 @@ +{ + "entities": [ + { + "type": "protein", + "sequence": "NSVHPCCDPVICEPREGEHCISGPCCENCYFLNSGTICKRARGDGNQDYCTGITPDCPRNRYNV", + "count": 2 + }, + { + "type": "bond", + "bond": "A,CYS,6,SG,A,CYS,29,SG,disulf,2.3;A,CYS,20,SG,A,CYS,26,SG,disulf,2.3;A,CYS,25,SG,A,CYS,50,SG,disulf,2.3;A,CYS,38,SG,A,CYS,57,SG,disulf,2.3", + "_case_from": "https://www.uniprot.org/uniprotkb/P83658/entry#ptm_processing", + "_note": "Intrachain, A" + }, + { + "type": "bond", + "bond": "B,CYS,6,SG,B,CYS,29,SG,disulf,2.3;B,CYS,20,SG,B,CYS,26,SG,disulf,2.3;B,CYS,25,SG,B,CYS,50,SG,disulf,2.3;B,CYS,38,SG,B,CYS,57,SG,disulf,2.3", + "_case_from": "https://www.uniprot.org/uniprotkb/P83658/entry#ptm_processing", + "_note": "Intrachain, B" + }, + { + "type": "bond", + "bond": "A,CYS,7,SG,B,CYS,12,SG,disulf,2.3", + "_case_from": "https://www.uniprot.org/uniprotkb/P83658/entry#ptm_processing", + "_note": "Interchain, AB" + }, + { + "type": "bond", + "bond": "B,CYS,7,SG,A,CYS,12,SG,disulf,2.3", + "_case_from": "https://www.uniprot.org/uniprotkb/P83658/entry#ptm_processing", + "_note": "Interchain, BA" + } + ] +} \ No newline at end of file From 0026e5d634cc63260a268ef15bf3d13ac47c8056 Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Sat, 24 Aug 2024 21:12:45 +0800 Subject: [PATCH 40/42] case: metalc --- .../helixfold3/data/demo_4Fe-4S.json | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 apps/protein_folding/helixfold3/data/demo_4Fe-4S.json diff --git a/apps/protein_folding/helixfold3/data/demo_4Fe-4S.json b/apps/protein_folding/helixfold3/data/demo_4Fe-4S.json new file mode 100644 index 00000000..4599c4af --- /dev/null +++ b/apps/protein_folding/helixfold3/data/demo_4Fe-4S.json @@ -0,0 +1,45 @@ +{ + "entities": [ + { + "type": "protein", + "sequence": "MKLNVDGLLVYFPYDYIYPEQFSYMRELKRTLDAKGHGVLEMPSGTGKTVSLLALIMAYQRAYPLEVTKLIYCSRTVPEIEKVIEELRKLLNFYEKQEGEKLPFLGLALSSRKNLCIHPEVTPLRFGKDVDGKCHSLTASYVRAQYQHDTSLPHCRFYEEFDAHGREVPLPAGIYNLDDLKALGRRQGWCPYFLARYSILHANVVVYSYHYLLDPKIADLVSKELARKAVVVFDEAHNIDNVCIDSMSVNLTRRTLDRCQGNLETLQKTVLRIKETDEQRLRDEYRRLVEGLREASAARETDAHLANPVLPDEVLQEAVPGSIRTAEHFLGFLRRLLEYVKWRLRVQHVVQESPPAFLSGLAQRVCIQRKPLRFCAERLRSLLHTLEITDLADFSPLTLLANFATLVSTYAKGFTIIIEPFDDRTPTIANPILHFSCMDASLAIKPVFERFQSVIITSGTLSPLDIYPKILDFHPVTMATFTMTLARVCLCPMIIGRGNDQVAISSKFETREDIAVIRNYGNLLLEMSAVVPDGIVAFFTSYQYMESTVASWYEQGILENIQRNKLLFIETQDGAETSVALEKYQEACENGRGAILLSVARGKVSEGIDFVHHYGRAVIMFGVPYVYTQSRILKARLEYLRDQFQIRENDFLTFDAMRHAAQCVGRAIRGKTDYGLMVFADKRFARGDKRGKLPRWIQEHLTDANLNLTVDEGVQVAKYFLRQMAQPFHREDQLGLSLLSLEQLESEETLKRIEQIAQQL", + "count": 1 + }, + { + "type": "ligand", + "ccd": "SF4", + "count": 1, + "_note": "5T5I" + }, + { + "type": "bond", + "bond": "A,CYS,116,SG,B,SF4,1,FE1,metalc,2.2;A,CYS,134,SG,B,SF4,1,FE2,metalc,2.2;A,CYS,155,SG,B,SF4,1,FE3,metalc,2.2;A,CYS,190,SG,B,SF4,1,FE4,metalc,2.2", + "_case_from": "https://www.uniprot.org/uniprotkb/P18074/entry", + "_note":"ALL_CYS-ALL_FE" + }, + { + "type": "bond", + "bond": "B,SF4,1,FE1,B,SF4,1,S2,metalc,2.2;B,SF4,1,FE1,B,SF4,1,S3,metalc,2.2;B,SF4,1,FE1,B,SF4,1,S4,metalc,2.2", + "_case_from": "https://www.uniprot.org/uniprotkb/P18074/entry", + "_note":"FE1-S234" + }, + { + "type": "bond", + "bond": "B,SF4,1,FE2,B,SF4,1,S1,metalc,2.2;B,SF4,1,FE2,B,SF4,1,S3,metalc,2.2;B,SF4,1,FE2,B,SF4,1,S4,metalc,2.2", + "_case_from": "https://www.uniprot.org/uniprotkb/P18074/entry", + "_note":"FE2-S134" + }, + { + "type": "bond", + "bond": "B,SF4,1,FE3,B,SF4,1,S1,metalc,2.2;B,SF4,1,FE3,B,SF4,1,S2,metalc,2.2;B,SF4,1,FE3,B,SF4,1,S4,metalc,2.2", + "_case_from": "https://www.uniprot.org/uniprotkb/P18074/entry", + "_note":"FE3-S124" + }, + { + "type": "bond", + "bond": "B,SF4,1,FE4,B,SF4,1,S1,metalc,2.2;B,SF4,1,FE14,B,SF4,1,S2,metalc,2.2;B,SF4,1,FE4,B,SF4,1,S3,metalc,2.2", + "_case_from": "https://www.uniprot.org/uniprotkb/P18074/entry", + "_note":"FE4-S123" + } + ] +} \ No newline at end of file From 658cf28bc238cd3808ac8de3748b063cb8bd143f Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Sat, 24 Aug 2024 12:26:00 +0800 Subject: [PATCH 41/42] Update pipeline_conf_bonds.py --- .../helixfold3/helixfold/data/pipeline_conf_bonds.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py index a7d2708e..6b46bb18 100644 --- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py +++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py @@ -27,7 +27,8 @@ } # Define the possible bond types as a Literal -BondType = Literal["covale", "metal", "hydrogen", "ionic", "disulfide", "aromatic"] +# https://mmcif.wwpdb.org/dictionaries/mmcif_std.dic/Items/_struct_conn_type.id.html +BondType = Literal["covale", "covale_base", "covale_phosphate", "covale_sugar", "disulf", "hydrog",'metalc','mismat','modres','saltbr'] @dataclass @@ -281,8 +282,8 @@ def make_bond_features(covalent_bond: List[CovalentBond], all_chain_info, ccd_pr right_bond_idx = 1 if right_bond_idx == '.' else right_bond_idx if _bond.bond_type != "covale": - logging.warning(f'Ignore non-covale bond type: {_bond.bond_type}') - continue + logging.warning(f'Detected non-covale bond type: {_bond.bond_type}') + # continue if _bond.pdbx_dist_value > 2.4: # the covalent_bond is cut off by distance=2.4 From ab7a4242d90527bc8864f742597a82845c15b9df Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Mon, 26 Aug 2024 14:27:45 +0800 Subject: [PATCH 42/42] chore: cpu only for msa only --- .../helixfold3/helixfold/inference.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py index 8a862279..57fd692c 100644 --- a/apps/protein_folding/helixfold3/helixfold/inference.py +++ b/apps/protein_folding/helixfold3/helixfold/inference.py @@ -453,6 +453,12 @@ def split_prediction(pred, rank): def main(cfg: DictConfig): logging.set_verbosity(cfg.logging_level) + if cfg.msa_only == True: + logging.warning(f'Model inference will be skipped because MSA-only mode is required.') + logging.warning(f'Use CPU only') + paddle.device.set_device("cpu") + + """main function""" new_einsum = os.getenv("FLAGS_new_einsum", True) print(f'>>> PaddlePaddle commit: {paddle.version.commit}') @@ -505,6 +511,8 @@ def main(cfg: DictConfig): model.helixfold.set_state_dict(pd_params['model']) else: model.helixfold.set_state_dict(pd_params) + + if cfg.precision == "bf16" and cfg.amp_level == "O2": raise NotImplementedError("bf16 O2 is not supported yet.") @@ -531,6 +539,10 @@ def main(cfg: DictConfig): with open(features_pkl, 'wb') as f: pickle.dump(feature_dict, f, protocol=4) + if cfg.msa_only == True: + logging.warning(f'Model inference is skipped because MSA-only mode is required.') + exit() + feature_dict['feat'] = batch_convert(feature_dict['feat'], add_batch=True) feature_dict['label'] = batch_convert(feature_dict['label'], add_batch=True)