From 662bbff1a37c9cf9ce57f9007c89f89576e740ab Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Fri, 16 Aug 2024 15:34:27 +0800
Subject: [PATCH 01/42] chore: hydra and pip

---
 .../helixfold/common/all_atom_pdb_save.py     |   7 +-
 .../helixfold/config/helixfold.yaml           |  59 ++++
 .../infer_scripts/feature_processing_aa.py    |   0
 .../infer_scripts/preprocess.py               |  13 +-
 .../infer_scripts/tools/mmcif_writer.py       |   0
 .../helixfold3/{ => helixfold}/inference.py   | 289 +++++++-----------
 .../{ => helixfold}/utils/__init__.py         |   0
 .../helixfold3/{ => helixfold}/utils/misc.py  |   0
 .../helixfold3/{ => helixfold}/utils/model.py |   0
 .../helixfold3/{ => helixfold}/utils/utils.py |   0
 .../protein_folding/helixfold3/pyproject.toml |  48 +++
 apps/protein_folding/helixfold3/setup_env.sh  |  18 ++
 12 files changed, 248 insertions(+), 186 deletions(-)
 create mode 100644 apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
 rename apps/protein_folding/helixfold3/{ => helixfold}/infer_scripts/feature_processing_aa.py (100%)
 rename apps/protein_folding/helixfold3/{ => helixfold}/infer_scripts/preprocess.py (97%)
 rename apps/protein_folding/helixfold3/{ => helixfold}/infer_scripts/tools/mmcif_writer.py (100%)
 rename apps/protein_folding/helixfold3/{ => helixfold}/inference.py (64%)
 rename apps/protein_folding/helixfold3/{ => helixfold}/utils/__init__.py (100%)
 rename apps/protein_folding/helixfold3/{ => helixfold}/utils/misc.py (100%)
 rename apps/protein_folding/helixfold3/{ => helixfold}/utils/model.py (100%)
 rename apps/protein_folding/helixfold3/{ => helixfold}/utils/utils.py (100%)
 create mode 100644 apps/protein_folding/helixfold3/pyproject.toml
 create mode 100644 apps/protein_folding/helixfold3/setup_env.sh

diff --git a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
index deb8e087..9c9f288e 100644
--- a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
+++ b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
@@ -164,10 +164,13 @@ def prediction_to_mmcif(pred_atom_pos: Union[np.ndarray, paddle.Tensor],
     - maxit_binary: path to maxit_binary, use to convert pdb to cif
     - mmcif_path: path to save *.cif
   """
-  assert maxit_binary is not None and os.path.exists(maxit_binary), (
+  if os.path.isfile(maxit_binary):
+    raise FileNotFoundError(
       f'maxit_binary: {maxit_binary} not exists. '
       f'link: https://sw-tools.rcsb.org/apps/MAXIT/source.html')
-  assert mmcif_path.endswith('.cif'), f'mmcif_path should endswith .cif; got {mmcif_path}'
+  
+  if not mmcif_path.endswith('.cif'):
+     raise ValueError(f'mmcif_path should endswith .cif; got {mmcif_path}')
 
   pdb_path = mmcif_path.replace('.cif', '.pdb')
   pdb_path = prediction_to_pdb(pred_atom_pos, FeatsDict, pdb_path)
diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
new file mode 100644
index 00000000..fd70ada0
--- /dev/null
+++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
@@ -0,0 +1,59 @@
+defaults:
+  - _self_
+
+# General configuration
+
+bf16_infer: false  # Corresponds to --bf16_infer
+seed: null  # Corresponds to --seed
+logging_level: DEBUG  # Corresponds to --logging_level
+job_id: 'structure_prediction'  # Corresponds to --model_name
+weight_path: /mnt/db/weights/helixfold/HelixFold3-params-240814/HelixFold3-240814.pdparams  # Corresponds to --init_model
+precision: fp32  # Corresponds to --precision
+amp_level: O1  # Corresponds to --amp_level
+infer_times: 1  # Corresponds to --infer_times
+diff_batch_size: -1  # Corresponds to --diff_batch_size
+use_small_bfd: false # Corresponds to --use_small_bfd
+
+# File paths
+
+input: null  # Corresponds to --input_json, required field
+output: null  # Corresponds to --output_dir, required field
+
+
+# Binary tool paths, leave them as null to find proper ones under PATH or conda bin path
+bin:
+  jackhmmer: null    # Corresponds to --jackhmmer_binary_path
+  hhblits: null  # Corresponds to --hhblits_binary_path
+  hhsearch: null   # Corresponds to --hhsearch_binary_path
+  kalign: null  # Corresponds to --kalign_binary_path
+  hmmsearch: null  # Corresponds to --hmmsearch_binary_path
+  hmmbuild: null  # Corresponds to --hmmbuild_binary_path
+  nhmmer: null  # Corresponds to --nhmmer_binary_path
+  obabel: null
+
+# Database paths
+db:
+  uniprot: /mnt/db/uniprot/uniprot.fasta  # Corresponds to --uniprot_database_path, required field
+  pdb_seqres: /mnt/db/pdb_seqres/pdb_seqres.txt  # Corresponds to --pdb_seqres_database_path, required field
+  uniref90: /mnt/db/uniref90/uniref90.fasta  # Corresponds to --uniref90_database_path, required field
+  mgnify: /mnt/db/mgnify/mgy_clusters.fa  # Corresponds to --mgnify_database_path, required field
+  bfd: /mnt/db/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt  # Corresponds to --bfd_database_path
+  small_bfd: null  # Corresponds to --small_bfd_database_path
+  uniclust30: /mnt/db/uniref30_uc30/UniRef30_2022_02/UniRef30_2022_02  # Corresponds to --uniclust30_database_path
+  rfam: /mnt/db/helixfold/rna/Rfam-14.9_rep_seq.fasta  # Corresponds to --rfam_database_path, required field
+  ccd_preprocessed: /mnt/db/ccd/ccd_preprocessed_etkdg.pkl.gz  # Corresponds to --ccd_preprocessed_path, required field
+
+# Template and PDB information
+template:
+  mmcif_dir: /mnt/db/pdb_mmcif/mmcif_files  # Corresponds to --template_mmcif_dir, required field
+  max_date: '2023-03-15'  # Corresponds to --max_template_date, required field
+  obsolete_pdbs: /mnt/db/pdb_mmcif/obsolete.dat  # Corresponds to --obsolete_pdbs_path, required field
+
+# Preset configuration
+preset:
+  preset: full_dbs  # Corresponds to --preset, choices=['reduced_dbs', 'full_dbs']
+
+# Other configurations
+other:
+  maxit_binary: /mnt/data/yinying/software/maxit/maxit-v11.100-prod-src/bin/maxit  # Corresponds to --maxit_binary
+  no_msa_templ_feats: false  # Corresponds to --no_msa_templ_feats
diff --git a/apps/protein_folding/helixfold3/infer_scripts/feature_processing_aa.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py
similarity index 100%
rename from apps/protein_folding/helixfold3/infer_scripts/feature_processing_aa.py
rename to apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py
diff --git a/apps/protein_folding/helixfold3/infer_scripts/preprocess.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py
similarity index 97%
rename from apps/protein_folding/helixfold3/infer_scripts/preprocess.py
rename to apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py
index 41cd44ac..eb8eb14f 100644
--- a/apps/protein_folding/helixfold3/infer_scripts/preprocess.py
+++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py
@@ -5,17 +5,17 @@
         'seqs': ccd_seqs,
         'msa_seqs': msa_seqs,
         'count': count,
-        'extra_mol_infos': {}， for which seqs has the modify residue type or smiles.
+        'extra_mol_infos': {}, for which seqs has the modify residue type or smiles.
 """
 import collections
 import copy
+import gzip
 import os
 import json
 import sys
 import subprocess
 import tempfile
 import itertools
-sys.path.append('../')
 import rdkit
 from rdkit import Chem
 from rdkit.Chem import AllChem
@@ -52,9 +52,7 @@
     3: 'Unknown error.'
 }
 
-OBABEL_BIN = os.getenv('OBABEL_BIN')
-if not os.path.exists(OBABEL_BIN):
-    raise FileNotFoundError(f'Cannot find obabel binary at {OBABEL_BIN}.')
+
 
 
 def read_json(path):
@@ -144,6 +142,11 @@ def smiles_toMol_obabel(smiles):
     """
         generate mol from smiles using obabel;
     """    
+    
+    OBABEL_BIN = os.getenv('OBABEL_BIN')
+    if not (OBABEL_BIN and os.path.isfile(OBABEL_BIN)):
+        raise FileNotFoundError(f'Cannot find obabel binary at {OBABEL_BIN}.')
+    
     with tempfile.NamedTemporaryFile(suffix=".mol2") as temp_file:
         print(f"[OBABEL] Temporary file created: {temp_file.name}")
         obabel_cmd = f"{OBABEL_BIN} -:'{smiles}' -omol2 -O{temp_file.name} --gen3d"
diff --git a/apps/protein_folding/helixfold3/infer_scripts/tools/mmcif_writer.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/tools/mmcif_writer.py
similarity index 100%
rename from apps/protein_folding/helixfold3/infer_scripts/tools/mmcif_writer.py
rename to apps/protein_folding/helixfold3/helixfold/infer_scripts/tools/mmcif_writer.py
diff --git a/apps/protein_folding/helixfold3/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py
similarity index 64%
rename from apps/protein_folding/helixfold3/inference.py
rename to apps/protein_folding/helixfold3/helixfold/inference.py
index 51cf6ec6..b3fbf745 100644
--- a/apps/protein_folding/helixfold3/inference.py
+++ b/apps/protein_folding/helixfold3/helixfold/inference.py
@@ -16,7 +16,6 @@
 import re
 import os
 import copy
-import argparse
 import random
 import paddle
 import json
@@ -25,6 +24,11 @@
 import shutil
 import logging
 import numpy as np
+import shutil
+
+from omegaconf import DictConfig
+import hydra
+
 from helixfold.common import all_atom_pdb_save
 from helixfold.model import config, utils
 from helixfold.data import pipeline_parallel as pipeline
@@ -34,12 +38,14 @@
 from helixfold.data.utils import atom_level_keys, map_to_continuous_indices
 from helixfold.data.tools import hmmsearch
 from helixfold.data import templates
-from utils.utils import get_custom_amp_list
-from utils.model import RunModel
-from utils.misc import set_logging_level
+from helixfold.utils.utils import get_custom_amp_list
+from helixfold.utils.model import RunModel
+from helixfold.utils.misc import set_logging_level
 from typing import Dict
-from infer_scripts import feature_processing_aa, preprocess
-from infer_scripts.tools import mmcif_writer
+from helixfold.infer_scripts import feature_processing_aa, preprocess
+from helixfold.infer_scripts.tools import mmcif_writer
+
+script_path=os.path.dirname(__file__)
 
 ALLOWED_LIGAND_BONDS_TYPE_MAP = preprocess.ALLOWED_LIGAND_BONDS_TYPE_MAP
 INVERSE_ALLOWED_LIGAND_BONDS_TYPE_MAP = {
@@ -105,45 +111,57 @@ def convert_to_json_compatible(obj):
         return [convert_to_json_compatible(i) for i in obj]
     else:
         return obj
-    
-def get_msa_templates_pipeline(args) -> Dict:
-    use_precomputed_msas = True # FLAGS.use_precomputed_msas
+
+def resolve_bin_path(cfg_path: str, default_binary_name: str)-> str:
+    """Helper function to resolve the binary path."""
+    if cfg_path and os.path.isfile(cfg_path):
+        return cfg_path
+
+    if cfg_val:=shutil.which(default_binary_name):
+        logging.warning(f'Using resolved {default_binary_name}: {cfg_val}')
+        return cfg_val
+
+    raise FileNotFoundError(f"Could not find a proper binary path for {default_binary_name}: {cfg_path}.")
+
+def get_msa_templates_pipeline(cfg: DictConfig) -> Dict:
+    use_precomputed_msas = True  # Assuming this is a constant or should be set globally
+
     template_searcher = hmmsearch.Hmmsearch(
-        binary_path=args.hmmsearch_binary_path,
-        hmmbuild_binary_path=args.hmmbuild_binary_path,
-        database_path=args.pdb_seqres_database_path)
+        binary_path=resolve_bin_path(cfg.bin.hmmsearch, 'hmmsearch'),
+        hmmbuild_binary_path=resolve_bin_path(cfg.bin.hmmbuild, 'hmmbuild'),
+        database_path=cfg.db.pdb_seqres)
 
     template_featurizer = templates.HmmsearchHitFeaturizer(
-        mmcif_dir=args.template_mmcif_dir,
-        max_template_date=args.max_template_date,
+        mmcif_dir=cfg.template.mmcif_dir,
+        max_template_date=cfg.template.max_date,
         max_hits=MAX_TEMPLATE_HITS,
-        kalign_binary_path=args.kalign_binary_path,
+        kalign_binary_path=resolve_bin_path(cfg.bin.kalign, 'kalign'),
         release_dates_path=None,
-        obsolete_pdbs_path=args.obsolete_pdbs_path)
+        obsolete_pdbs_path=cfg.template.obsolete_pdbs)
 
     monomer_data_pipeline = pipeline.DataPipeline(
-        jackhmmer_binary_path=args.jackhmmer_binary_path,
-        hhblits_binary_path=args.hhblits_binary_path,
-        hhsearch_binary_path=args.hhsearch_binary_path,
-        uniref90_database_path=args.uniref90_database_path,
-        mgnify_database_path=args.mgnify_database_path,
-        bfd_database_path=args.bfd_database_path,
-        uniclust30_database_path=args.uniclust30_database_path,
-        small_bfd_database_path=args.small_bfd_database_path ,
+        jackhmmer_binary_path=resolve_bin_path(cfg.bin.jackhmmer, 'jackhmmer'),
+        hhblits_binary_path=resolve_bin_path(cfg.bin.hhblits, 'hhblits'),
+        hhsearch_binary_path=resolve_bin_path(cfg.bin.hhsearch, 'hhsearch'),
+        uniref90_database_path=cfg.db.uniref90,
+        mgnify_database_path=cfg.db.mgnify,
+        bfd_database_path=cfg.db.bfd,
+        uniclust30_database_path=cfg.db.uniclust30,
+        small_bfd_database_path=cfg.db.small_bfd,
         template_searcher=template_searcher,
         template_featurizer=template_featurizer,
-        use_small_bfd=args.use_small_bfd,
+        use_small_bfd=cfg.use_small_bfd,
         use_precomputed_msas=use_precomputed_msas)
 
     prot_data_pipeline = pipeline_multimer.DataPipeline(
         monomer_data_pipeline=monomer_data_pipeline,
-        jackhmmer_binary_path=args.jackhmmer_binary_path,
-        uniprot_database_path=args.uniprot_database_path,
+        jackhmmer_binary_path=resolve_bin_path(cfg.bin.jackhmmer, 'jackhmmer'),
+        uniprot_database_path=cfg.db.uniprot,
         use_precomputed_msas=use_precomputed_msas)
 
     rna_monomer_data_pipeline = pipeline_rna.RNADataPipeline(
-      hmmer_binary_path=args.nhmmer_binary_path,
-      rfam_database_path=args.rfam_database_path,
+      hmmer_binary_path=resolve_bin_path(cfg.bin.nhmmer, 'nhmmer'),
+      rfam_database_path=cfg.db.rfam,
       rnacentral_database_path=None,
       nt_database_path=None,     
       species_identifer_map_path=None,
@@ -156,7 +174,6 @@ def get_msa_templates_pipeline(args) -> Dict:
         'protein': prot_data_pipeline,
         'rna': rna_data_pipeline
     }
-
 def ranking_all_predictions(output_dirs):
     ranking_score_path_map = {}
     for outpath in output_dirs:
@@ -176,27 +193,29 @@ def ranking_all_predictions(output_dirs):
         rank_id += 1
 
 @paddle.no_grad()
-def eval(args, model, batch):
-    """evaluate a given dataset"""
+def eval(cfg: DictConfig, model:RunModel, batch):
+    """Evaluate a given dataset"""
     model.eval()       
         
-    # inference
+    # Inference
     def _forward_with_precision(batch):
-        if args.precision == "bf16" or args.bf16_infer:
+        precision=cfg.precision
+        if precision not in ('bf16','fp32',):
+            raise ValueError("Please choose precision from bf16 and fp32!")
+
+        if cfg.precision == "bf16" or cfg.bf16_infer:
             black_list, white_list = get_custom_amp_list()
             with paddle.amp.auto_cast(enable=True,
-                                        custom_white_list=white_list, 
-                                        custom_black_list=black_list, 
-                                        level=args.amp_level, 
-                                        dtype='bfloat16'):
+                                      custom_white_list=white_list, 
+                                      custom_black_list=black_list, 
+                                      level=cfg.amp_level, 
+                                      dtype='bfloat16'):
                 return model(batch, compute_loss=False)
-        elif args.precision == "fp32":
-            return model(batch, compute_loss=False)
-        else:
-            raise ValueError("Please choose precision from bf16 and fp32! ")
+
+        return model(batch, compute_loss=False)
         
     res = _forward_with_precision(batch)
-    logger.info(f"Inference Succeeds...\n")
+    logger.info("Inference Succeeds...\n")
     return res
 
 
@@ -430,52 +449,55 @@ def split_prediction(pred, rank):
     return prediction
 
 
-def main(args):
-    set_logging_level(args.logging_level)
+@hydra.main(version_base=None, config_path=os.path.join(script_path,'config',),config_name='helixfold')
+def main(cfg: DictConfig):
+    set_logging_level(cfg.logging_level)
 
     """main function"""
     new_einsum = os.getenv("FLAGS_new_einsum", True)
     print(f'>>> PaddlePaddle commit: {paddle.version.commit}')
     print(f'>>> FLAGS_new_einsum: {new_einsum}')
-    print(f'>>> args:\n{args}')
+    print(f'>>> config:\n{cfg}')
 
-    all_entitys = preprocess_json_entity(args.input_json, args.output_dir)
+    all_entitys = preprocess_json_entity(cfg.input, cfg.output)
     ## check maxit binary path
-    if args.maxit_binary is not None:
-        assert os.path.exists(args.maxit_binary), \
-            f"The maxit binary path {args.maxit_binary} does not exists."
+    maxit_binary=resolve_bin_path(cfg.other.maxit_binary,'maxit')
+    
+    RCSBROOT=os.path.dirname(maxit_binary)
+    os.environ['RCSBROOT']=RCSBROOT
 
+    ## check obabel
+    obabel_bin=resolve_bin_path(cfg.bin.obabel,'obabel')
+    os.environ['OBABEL_BIN']=obabel_bin
 
-    ### set seed for reproduce experiment results
-    seed = args.seed
+    ### Set seed for reproducibility
+    seed = cfg.seed
     if seed is None:
         seed = np.random.randint(10000000)
     else:
-        logger.warning('seed is only used for reproduction')
+        logger.warning('Seed is only used for reproduction')
     init_seed(seed)
 
-
-    use_small_bfd = args.preset == 'reduced_dbs'
-    setattr(args, 'use_small_bfd', use_small_bfd)
+    use_small_bfd = cfg.preset.preset == 'reduced_dbs'
+    setattr(cfg, 'use_small_bfd', use_small_bfd)
     if use_small_bfd:
-        assert args.small_bfd_database_path is not None
+        assert cfg.db.small_bfd is not None
     else:
-        assert args.bfd_database_path is not None
-        assert args.uniclust30_database_path is not None
+        assert cfg.db.bfd is not None
+        assert cfg.db.uniclust30 is not None
 
     logger.info('Getting MSA/Template Pipelines...')
     msa_templ_data_pipeline_dict = get_msa_templates_pipeline(args)
         
-
-    ### create model
-    model_config = config.model_config(args.model_name)
-    print(f'>>> model_config:\n{model_config}')
+    ### Create model
+    model_config = config.model_config(cfg.job_id)
+    #print(f'>>> model_config:\n{model_config}')
 
     model = RunModel(model_config)
 
-    if (not args.init_model is None) and (not args.init_model == ""):
-        print(f"Load pretrain model from {args.init_model}")
-        pd_params = paddle.load(args.init_model)
+    if (not cfg.weight_path is None) and (cfg.weight_path != ""):
+        print(f"Load pretrain model from {cfg.weight_path}")
+        pd_params = paddle.load(cfg.weight_path)
         
         has_opt = 'optimizer' in pd_params
         if has_opt:
@@ -483,42 +505,46 @@ def main(args):
         else:
             model.helixfold.set_state_dict(pd_params)
     
-    if args.precision == "bf16" and args.amp_level == "O2":
+    if cfg.precision == "bf16" and cfg.amp_level == "O2":
         raise NotImplementedError("bf16 O2 is not supported yet.")
 
     print(f"============ Data Loading ============")
-    job_base = pathlib.Path(args.input_json).stem
-    output_dir_base = pathlib.Path(args.output_dir).joinpath(job_base)
+    job_base = pathlib.Path(cfg.input).stem
+    output_dir_base = pathlib.Path(cfg.output).joinpath(job_base)
     msa_output_dir = output_dir_base.joinpath('msas')
     msa_output_dir.mkdir(parents=True, exist_ok=True)
 
     features_pkl = output_dir_base.joinpath('final_features.pkl')
-    feature_dict = feature_processing_aa.process_input_json(
-                    all_entitys, 
-                    ccd_preprocessed_path=args.ccd_preprocessed_path,
-                    msa_templ_data_pipeline_dict=msa_templ_data_pipeline_dict,
-                    msa_output_dir=msa_output_dir)
+    if features_pkl.exists():
+        with open(features_pkl, 'rb') as f:
+            feature_dict = pickle.load(f)
+    else:
+        feature_dict = feature_processing_aa.process_input_json(
+                        all_entitys, 
+                        ccd_preprocessed_path=cfg.db.ccd_preprocessed,
+                        msa_templ_data_pipeline_dict=msa_templ_data_pipeline_dict,
+                        msa_output_dir=msa_output_dir)
 
-    # save features
-    with open(features_pkl, 'wb') as f:
-        pickle.dump(feature_dict, f, protocol=4)
+        # save features
+        with open(features_pkl, 'wb') as f:
+            pickle.dump(feature_dict, f, protocol=4)
 
     feature_dict['feat'] = batch_convert(feature_dict['feat'], add_batch=True)
     feature_dict['label'] = batch_convert(feature_dict['label'], add_batch=True)
     
     print(f"============ Start Inference ============")
     
-    infer_times = args.infer_times
-    if args.diff_batch_size > 0:
-        model_config.model.heads.diffusion_module.test_diff_batch_size = args.diff_batch_size
+    infer_times = cfg.infer_times
+    if cfg.diff_batch_size > 0:
+        model_config.model.heads.diffusion_module.test_diff_batch_size = cfg.diff_batch_size
     diff_batch_size = model_config.model.heads.diffusion_module.test_diff_batch_size 
     logger.info(f'Inference {infer_times} Times...')
-    logger.info(f" diffusion batch size {diff_batch_size}...\n")
+    logger.info(f"Diffusion batch size {diff_batch_size}...\n")
     all_pred_path = []
     for infer_id in range(infer_times):
         
         logger.info(f'Start {infer_id}-th inference...\n')
-        prediction = eval(args, model, feature_dict)
+        prediction = eval(cfg, model, feature_dict)
         
         # save result
         prediction = split_prediction(prediction, diff_batch_size)
@@ -530,7 +556,7 @@ def main(args):
                         feature_dict=feature_dict,
                         prediction=prediction[rank_id],
                         output_dir=output_dir, 
-                        maxit_bin=args.maxit_binary)
+                        maxit_bin=cfg.other.maxit_binary)
             all_pred_path.append(output_dir)
     
     # final ranking
@@ -538,100 +564,5 @@ def main(args):
     ranking_all_predictions(all_pred_path)
     print(f'============ Inference finished ! ============')
 
-
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--bf16_infer", action='store_true', default=False)
-    parser.add_argument("--seed", type=int, default=None, help="set seed for reproduce experiment results, None is do not set seed")
-    parser.add_argument("--logging_level", type=str, default="DEBUG", help="NOTSET, DEBUG, INFO, WARNING, ERROR, CRITICAL")
-    parser.add_argument("--model_name", type=str, help='used to choose model config')
-    parser.add_argument("--init_model", type=str, default='')
-    parser.add_argument("--precision", type=str, choices=['fp32', 'bf16'], default='fp32')
-    parser.add_argument("--amp_level", type=str, default='O1')
-    parser.add_argument("--infer_times", type=int, default=1)
-    parser.add_argument("--diff_batch_size", type=int, default=-1)
-    parser.add_argument('--input_json', type=str,
-                        default=None, required=True,
-                        help='Paths to json file, each containing '
-                        'entity information including sequence, smiles or CCD, copies etc.')
-    parser.add_argument('--output_dir', type=str,
-                        default=None, required=True,
-                        help='Path to a directory that will store results.')
-    parser.add_argument('--ccd_preprocessed_path', type=str,
-                        default=None, required=True,
-                        help='Path to CCD preprocessed files.')
-    parser.add_argument('--jackhmmer_binary_path', type=str,
-                        default='/usr/bin/jackhmmer',
-                        help='Path to the JackHMMER executable.')
-    parser.add_argument('--hhblits_binary_path', type=str,
-                        default='/usr/bin/hhblits',
-                        help='Path to the HHblits executable.')
-    parser.add_argument('--hhsearch_binary_path', type=str,
-                        default='/usr/bin/hhsearch',
-                        help='Path to the HHsearch executable.')
-    parser.add_argument('--kalign_binary_path', type=str,
-                        default='/usr/bin/kalign',
-                        help='Path to the Kalign executable.')
-    parser.add_argument('--hmmsearch_binary_path', type=str,
-                        default='/usr/bin/hmmsearch',
-                        help='Path to the hmmsearch executable.')
-    parser.add_argument('--hmmbuild_binary_path', type=str,
-                        default='/usr/bin/hmmbuild',
-                        help='Path to the hmmbuild executable.')
-
-    # binary path of the tool for RNA MSA searching
-    parser.add_argument('--nhmmer_binary_path', type=str,
-                        default='/usr/bin/nhmmer',
-                        help='Path to the nhmmer executable.')
-    
-    parser.add_argument('--uniprot_database_path', type=str,
-                        default=None, required=True,
-                        help='Path to the Uniprot database for use '
-                        'by JackHMMER.')
-    parser.add_argument('--pdb_seqres_database_path', type=str,
-                        default=None, required=True,
-                        help='Path to the PDB '
-                        'seqres database for use by hmmsearch.')
-    parser.add_argument('--uniref90_database_path', type=str,
-                        default=None, required=True,
-                        help='Path to the Uniref90 database for use '
-                        'by JackHMMER.')
-    parser.add_argument('--mgnify_database_path', type=str,
-                        default=None, required=True,
-                        help='Path to the MGnify database for use by '
-                        'JackHMMER.')
-    parser.add_argument('--bfd_database_path', type=str, default=None,
-                        help='Path to the BFD database for use by HHblits.')
-    parser.add_argument('--small_bfd_database_path', type=str, default=None,
-                        help='Path to the small version of BFD used '
-                        'with the "reduced_dbs" preset.')
-    parser.add_argument('--uniclust30_database_path', type=str, default=None,
-                        help='Path to the Uniclust30 database for use '
-                        'by HHblits.')
-    # RNA MSA searching databases
-    parser.add_argument('--rfam_database_path', type=str,
-                        default=None, required=True,
-                        help='Path to the Rfam database for RNA MSA searching.')
-    parser.add_argument('--template_mmcif_dir', type=str,
-                        default=None, required=True,
-                        help='Path to a directory with template mmCIF '
-                        'structures, each named <pdb_id>.cif')
-    parser.add_argument('--max_template_date', type=str,
-                        default=None, required=True,
-                        help='Maximum template release date to consider. '
-                        'Important if folding historical test sets.')
-    parser.add_argument('--obsolete_pdbs_path', type=str,
-                        default=None, required=True,
-                        help='Path to file containing a mapping from '
-                        'obsolete PDB IDs to the PDB IDs of their '
-                        'replacements.')
-    parser.add_argument('--preset',
-                        default='full_dbs', required=False,
-                        choices=['reduced_dbs', 'full_dbs'],
-                        help='Choose preset model configuration - '
-                        'no ensembling and smaller genetic database '
-                        'config (reduced_dbs), no ensembling and full '
-                        'genetic database config  (full_dbs)')
-    parser.add_argument('--maxit_binary', type=str, default=None)
-    args = parser.parse_args()
-    main(args)
+    main()
diff --git a/apps/protein_folding/helixfold3/utils/__init__.py b/apps/protein_folding/helixfold3/helixfold/utils/__init__.py
similarity index 100%
rename from apps/protein_folding/helixfold3/utils/__init__.py
rename to apps/protein_folding/helixfold3/helixfold/utils/__init__.py
diff --git a/apps/protein_folding/helixfold3/utils/misc.py b/apps/protein_folding/helixfold3/helixfold/utils/misc.py
similarity index 100%
rename from apps/protein_folding/helixfold3/utils/misc.py
rename to apps/protein_folding/helixfold3/helixfold/utils/misc.py
diff --git a/apps/protein_folding/helixfold3/utils/model.py b/apps/protein_folding/helixfold3/helixfold/utils/model.py
similarity index 100%
rename from apps/protein_folding/helixfold3/utils/model.py
rename to apps/protein_folding/helixfold3/helixfold/utils/model.py
diff --git a/apps/protein_folding/helixfold3/utils/utils.py b/apps/protein_folding/helixfold3/helixfold/utils/utils.py
similarity index 100%
rename from apps/protein_folding/helixfold3/utils/utils.py
rename to apps/protein_folding/helixfold3/helixfold/utils/utils.py
diff --git a/apps/protein_folding/helixfold3/pyproject.toml b/apps/protein_folding/helixfold3/pyproject.toml
new file mode 100644
index 00000000..bc988ca9
--- /dev/null
+++ b/apps/protein_folding/helixfold3/pyproject.toml
@@ -0,0 +1,48 @@
+[build-system]
+requires = ["poetry-core>=1.0.0,<2.0.0"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.poetry]
+name = "helixfold"
+version = "3.0.0"
+description = "Code for helixfold v3"
+authors = ["Name <email@address>"]
+
+readme = "README.md"
+license = "MIT"
+repository = "https://github.com/PaddlePaddle/PaddleHelix/blob/dev/apps/protein_folding/helixfold3"
+classifiers = [
+    "Topic :: Scientific/Engineering :: Biochemistry",
+    "Topic :: Scientific/Engineering :: Protein Engineering"
+]
+
+
+packages = [
+    { include = "helixfold" },
+    { include = "helixfold/*.py" },
+]
+
+
+[tool.poetry.dependencies]
+python = "^3.8" 
+
+absl-py = "0.13.0"
+biopython = "1.79"
+chex = "0.0.7"
+dm-haiku = "0.0.4"
+dm-tree = "0.1.6"
+docker = "5.0.0"
+immutabledict = "2.0.0"
+jax = "0.2.14"
+ml-collections = "0.1.0"
+pandas = "1.3.4"
+scipy = "1.9.0"
+rdkit-pypi = "2022.9.5"
+posebusters = "*" 
+hydra-core= "^1.3.2"
+omegaconf = "^2.3.0"
+
+
+
+[tool.poetry.scripts]
+helixfold = 'helixfold.inference:main'
diff --git a/apps/protein_folding/helixfold3/setup_env.sh b/apps/protein_folding/helixfold3/setup_env.sh
new file mode 100644
index 00000000..30f008d6
--- /dev/null
+++ b/apps/protein_folding/helixfold3/setup_env.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+ENV_NAME='helixfold'
+CUDA=12.0
+
+# follow https://developer.nvidia.com/cuda-downloads to install cuda and cudatoolkit
+
+# Install py env
+conda create -n ${ENV_NAME} -y -c conda-forge  pip  python=3.9;
+source activate ${ENV_NAME}
+conda install -y cudnn=8.4.1 cudatoolkit=11.7 nccl=2.14.3 -c conda-forge -c nvidia
+
+conda install -y -c bioconda hmmer==3.3.2 kalign2==2.04 hhsuite==3.3.0 
+conda install -y -c conda-forge openbabel
+
+python -m pip install --upgrade 'pip<24';pip install .  --no-cache-dir
+
+pip install https://paddle-wheel.bj.bcebos.com/2.5.1/linux/linux-gpu-cuda11.7-cudnn8.4.1-mkl-gcc8.2-avx/paddlepaddle_gpu-2.5.1.post117-cp39-cp39-linux_x86_64.whl

From b4fce8af392b03b030610d810d878d65fee5b79b Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sat, 17 Aug 2024 10:52:13 +0800
Subject: [PATCH 02/42] use reduced bfd

---
 .../helixfold3/helixfold/config/helixfold.yaml              | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
index fd70ada0..60e8076d 100644
--- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
+++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
@@ -29,7 +29,7 @@ bin:
   hmmsearch: null  # Corresponds to --hmmsearch_binary_path
   hmmbuild: null  # Corresponds to --hmmbuild_binary_path
   nhmmer: null  # Corresponds to --nhmmer_binary_path
-  obabel: null
+  obabel: null  # Inject to env as OBABEL_BIN
 
 # Database paths
 db:
@@ -38,7 +38,7 @@ db:
   uniref90: /mnt/db/uniref90/uniref90.fasta  # Corresponds to --uniref90_database_path, required field
   mgnify: /mnt/db/mgnify/mgy_clusters.fa  # Corresponds to --mgnify_database_path, required field
   bfd: /mnt/db/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt  # Corresponds to --bfd_database_path
-  small_bfd: null  # Corresponds to --small_bfd_database_path
+  small_bfd: /mnt/db/reduced_bfd/bfd-first_non_consensus_sequences.fasta  # Corresponds to --small_bfd_database_path
   uniclust30: /mnt/db/uniref30_uc30/UniRef30_2022_02/UniRef30_2022_02  # Corresponds to --uniclust30_database_path
   rfam: /mnt/db/helixfold/rna/Rfam-14.9_rep_seq.fasta  # Corresponds to --rfam_database_path, required field
   ccd_preprocessed: /mnt/db/ccd/ccd_preprocessed_etkdg.pkl.gz  # Corresponds to --ccd_preprocessed_path, required field
@@ -51,7 +51,7 @@ template:
 
 # Preset configuration
 preset:
-  preset: full_dbs  # Corresponds to --preset, choices=['reduced_dbs', 'full_dbs']
+  preset: reduced_dbs  # Corresponds to --preset, choices=['reduced_dbs', 'full_dbs']
 
 # Other configurations
 other:

From d2aa1b9fdcd35e48955c87f51c33bfa221a895d2 Mon Sep 17 00:00:00 2001
From: Ryan Garcia <47666442+RyanGarciaLI@users.noreply.github.com>
Date: Thu, 15 Aug 2024 21:20:39 +0800
Subject: [PATCH 03/42] disable no MSA mode (#312)


From d52830086c4200a77abcaa11ee927475420f6f2c Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sat, 17 Aug 2024 11:37:57 +0800
Subject: [PATCH 04/42] add override flag

---
 .../helixfold3/helixfold/config/helixfold.yaml               | 2 +-
 apps/protein_folding/helixfold3/helixfold/inference.py       | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
index 60e8076d..3428b324 100644
--- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
+++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
@@ -18,6 +18,7 @@ use_small_bfd: false # Corresponds to --use_small_bfd
 
 input: null  # Corresponds to --input_json, required field
 output: null  # Corresponds to --output_dir, required field
+override: false # Set true to override existing msa output directory
 
 
 # Binary tool paths, leave them as null to find proper ones under PATH or conda bin path
@@ -56,4 +57,3 @@ preset:
 # Other configurations
 other:
   maxit_binary: /mnt/data/yinying/software/maxit/maxit-v11.100-prod-src/bin/maxit  # Corresponds to --maxit_binary
-  no_msa_templ_feats: false  # Corresponds to --no_msa_templ_feats
diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py
index b3fbf745..0105a1e3 100644
--- a/apps/protein_folding/helixfold3/helixfold/inference.py
+++ b/apps/protein_folding/helixfold3/helixfold/inference.py
@@ -487,7 +487,7 @@ def main(cfg: DictConfig):
         assert cfg.db.uniclust30 is not None
 
     logger.info('Getting MSA/Template Pipelines...')
-    msa_templ_data_pipeline_dict = get_msa_templates_pipeline(args)
+    msa_templ_data_pipeline_dict = get_msa_templates_pipeline(cfg=cfg)
         
     ### Create model
     model_config = config.model_config(cfg.job_id)
@@ -515,8 +515,9 @@ def main(cfg: DictConfig):
     msa_output_dir.mkdir(parents=True, exist_ok=True)
 
     features_pkl = output_dir_base.joinpath('final_features.pkl')
-    if features_pkl.exists():
+    if features_pkl.exists() and not cfg.override:
         with open(features_pkl, 'rb') as f:
+            logging.info(f'Load features from precomputed {features_pkl}')
             feature_dict = pickle.load(f)
     else:
         feature_dict = feature_processing_aa.process_input_json(

From a3d62e2e6b0ae6ba7cd141f56ab9d086bec897dd Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sat, 17 Aug 2024 15:09:33 +0800
Subject: [PATCH 05/42] add configs

---
 .../helixfold/config/helixfold.yaml           | 12 ++++++++
 .../helixfold3/helixfold/model/config.py      | 29 ++++++++++++++++---
 2 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
index 3428b324..12ae893e 100644
--- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
+++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
@@ -57,3 +57,15 @@ preset:
 # Other configurations
 other:
   maxit_binary: /mnt/data/yinying/software/maxit/maxit-v11.100-prod-src/bin/maxit  # Corresponds to --maxit_binary
+
+
+# CONFIG_DIFFS for advanced configuration
+CONFIG_DIFFS:
+  preset: null #choices=['null','allatom_demo', 'allatom_subbatch_64_recycle_1']
+  model:
+    global_config:
+      subbatch_size: 96 # model.global_config.subbatch_size
+  num_recycle: 3 # model.num_recycle
+  heads:
+    confidence_head:
+      weight: 0.0 # model.heads.confidence_head.weight
diff --git a/apps/protein_folding/helixfold3/helixfold/model/config.py b/apps/protein_folding/helixfold3/helixfold/model/config.py
index 6da8566a..2eafaaa6 100644
--- a/apps/protein_folding/helixfold3/helixfold/model/config.py
+++ b/apps/protein_folding/helixfold3/helixfold/model/config.py
@@ -15,7 +15,9 @@
 """Model config."""
 
 import copy
+from typing import Any, Union
 import ml_collections
+from omegaconf import DictConfig
 
 
 NUM_RES = 'num residues placeholder'
@@ -24,16 +26,35 @@
 NUM_TEMPLATES = 'num templates placeholder'
 
 
-def model_config(name: str) -> ml_collections.ConfigDict:
+def model_config(config_diffs: Union[str, DictConfig, dict[str, dict[str, Any]]]) -> ml_collections.ConfigDict:
   """Get the ConfigDict of a model."""
 
   cfg = copy.deepcopy(CONFIG_ALLATOM)
-  if name in CONFIG_DIFFS:
-    cfg.update_from_flattened_dict(CONFIG_DIFFS[name])
+  if config_diffs is None or config_diffs=='':
+    # early return if nothing is changed
+    return cfg
 
-  return cfg
+  if isinstance(config_diffs, DictConfig):
+    if 'preset' in config_diffs and (preset_name:=config_diffs['preset']) in CONFIG_DIFFS:
+      updated_config=CONFIG_DIFFS[preset_name]
+      cfg.update_from_flattened_dict(updated_config)
+      print(f'Updated config from `CONFIG_DIFFS.{preset_name}`: {updated_config}')
 
+      return cfg
+    if 'model' in config_diffs and (updated_config := config_diffs['model']) is not None:
+      cfg.update(dict(updated_config))
+      print(f'Updated config from `CONFIG_DIFFS`: {updated_config}')
 
+      return cfg
+  
+  raise ValueError(f'Invalid config_diffs ({type(config_diffs)}): {config_diffs}')
+    
+  
+  
+
+  
+
+# preset for runs
 CONFIG_DIFFS = {
     'allatom_demo': {
         'model.heads.confidence_head.weight': 0.01

From 6e39caa25802b72cc8d11d111a85094e8dc57cd7 Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sat, 17 Aug 2024 15:29:02 +0800
Subject: [PATCH 06/42] fix:config

---
 .../helixfold3/helixfold/config/helixfold.yaml    |  8 ++++----
 .../helixfold3/helixfold/inference.py             |  3 +--
 .../helixfold3/helixfold/model/config.py          | 15 +++++++--------
 3 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
index 12ae893e..561108fa 100644
--- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
+++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
@@ -65,7 +65,7 @@ CONFIG_DIFFS:
   model:
     global_config:
       subbatch_size: 96 # model.global_config.subbatch_size
-  num_recycle: 3 # model.num_recycle
-  heads:
-    confidence_head:
-      weight: 0.0 # model.heads.confidence_head.weight
+    num_recycle: 3 # model.num_recycle
+    heads:
+      confidence_head:
+        weight: 0.0 # model.heads.confidence_head.weight
diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py
index 0105a1e3..5ca95e3b 100644
--- a/apps/protein_folding/helixfold3/helixfold/inference.py
+++ b/apps/protein_folding/helixfold3/helixfold/inference.py
@@ -490,8 +490,7 @@ def main(cfg: DictConfig):
     msa_templ_data_pipeline_dict = get_msa_templates_pipeline(cfg=cfg)
         
     ### Create model
-    model_config = config.model_config(cfg.job_id)
-    #print(f'>>> model_config:\n{model_config}')
+    model_config = config.model_config(cfg.CONFIG_DIFFS)
 
     model = RunModel(model_config)
 
diff --git a/apps/protein_folding/helixfold3/helixfold/model/config.py b/apps/protein_folding/helixfold3/helixfold/model/config.py
index 2eafaaa6..c4682b29 100644
--- a/apps/protein_folding/helixfold3/helixfold/model/config.py
+++ b/apps/protein_folding/helixfold3/helixfold/model/config.py
@@ -41,18 +41,17 @@ def model_config(config_diffs: Union[str, DictConfig, dict[str, dict[str, Any]]]
       print(f'Updated config from `CONFIG_DIFFS.{preset_name}`: {updated_config}')
 
       return cfg
-    if 'model' in config_diffs and (updated_config := config_diffs['model']) is not None:
-      cfg.update(dict(updated_config))
-      print(f'Updated config from `CONFIG_DIFFS`: {updated_config}')
-
+    
+    # update from detailed configuration
+    if any(root_kw in config_diffs for root_kw in CONFIG_ALLATOM):
+      for root_kw in CONFIG_ALLATOM:
+        if root_kw in config_diffs and (updated_config := config_diffs[root_kw]) is not None:
+          cfg.update(dict(updated_config))
+          print(f'Updated config from `CONFIG_DIFFS`: {updated_config}')
       return cfg
   
   raise ValueError(f'Invalid config_diffs ({type(config_diffs)}): {config_diffs}')
     
-  
-  
-
-  
 
 # preset for runs
 CONFIG_DIFFS = {

From 29ef9c49241b3badd5851f93623ad83b11666bda Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sat, 17 Aug 2024 16:05:58 +0800
Subject: [PATCH 07/42] fix: maxit path

---
 .../helixfold/common/all_atom_pdb_save.py     |  2 +-
 .../helixfold/config/helixfold.yaml           |  1 -
 .../helixfold3/helixfold/inference.py         | 28 +++++++++----------
 3 files changed, 14 insertions(+), 17 deletions(-)

diff --git a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
index 9c9f288e..8d541bd7 100644
--- a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
+++ b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
@@ -164,7 +164,7 @@ def prediction_to_mmcif(pred_atom_pos: Union[np.ndarray, paddle.Tensor],
     - maxit_binary: path to maxit_binary, use to convert pdb to cif
     - mmcif_path: path to save *.cif
   """
-  if os.path.isfile(maxit_binary):
+  if not os.path.isfile(maxit_binary):
     raise FileNotFoundError(
       f'maxit_binary: {maxit_binary} not exists. '
       f'link: https://sw-tools.rcsb.org/apps/MAXIT/source.html')
diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
index 561108fa..623a3561 100644
--- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
+++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
@@ -6,7 +6,6 @@ defaults:
 bf16_infer: false  # Corresponds to --bf16_infer
 seed: null  # Corresponds to --seed
 logging_level: DEBUG  # Corresponds to --logging_level
-job_id: 'structure_prediction'  # Corresponds to --model_name
 weight_path: /mnt/db/weights/helixfold/HelixFold3-params-240814/HelixFold3-240814.pdparams  # Corresponds to --init_model
 precision: fp32  # Corresponds to --precision
 amp_level: O1  # Corresponds to --amp_level
diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py
index 5ca95e3b..97ae415c 100644
--- a/apps/protein_folding/helixfold3/helixfold/inference.py
+++ b/apps/protein_folding/helixfold3/helixfold/inference.py
@@ -193,29 +193,27 @@ def ranking_all_predictions(output_dirs):
         rank_id += 1
 
 @paddle.no_grad()
-def eval(cfg: DictConfig, model:RunModel, batch):
-    """Evaluate a given dataset"""
+def eval(args, model, batch):
+    """evaluate a given dataset"""
     model.eval()       
         
-    # Inference
+    # inference
     def _forward_with_precision(batch):
-        precision=cfg.precision
-        if precision not in ('bf16','fp32',):
-            raise ValueError("Please choose precision from bf16 and fp32!")
-
-        if cfg.precision == "bf16" or cfg.bf16_infer:
+        if args.precision == "bf16" or args.bf16_infer:
             black_list, white_list = get_custom_amp_list()
             with paddle.amp.auto_cast(enable=True,
-                                      custom_white_list=white_list, 
-                                      custom_black_list=black_list, 
-                                      level=cfg.amp_level, 
-                                      dtype='bfloat16'):
+                                        custom_white_list=white_list, 
+                                        custom_black_list=black_list, 
+                                        level=args.amp_level, 
+                                        dtype='bfloat16'):
                 return model(batch, compute_loss=False)
-
-        return model(batch, compute_loss=False)
+        elif args.precision == "fp32":
+            return model(batch, compute_loss=False)
+        else:
+            raise ValueError("Please choose precision from bf16 and fp32! ")
         
     res = _forward_with_precision(batch)
-    logger.info("Inference Succeeds...\n")
+    logger.info(f"Inference Succeeds...\n")
     return res
 
 
From 6ef4cd89be76ac48f4060569501d408f57bf530d Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sat, 17 Aug 2024 18:26:57 +0800
Subject: [PATCH 08/42] fix: maxit run with env

---
 .../helixfold/common/all_atom_pdb_save.py     | 27 ++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
index 8d541bd7..f15f956d 100644
--- a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
+++ b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
@@ -21,6 +21,7 @@
 import paddle
 import itertools
 import os
+import subprocess
 
 FeatureDict = Mapping[str, np.ndarray]
 ModelOutput = Mapping[str, Any]  # Is a nested dict.
@@ -174,7 +175,27 @@ def prediction_to_mmcif(pred_atom_pos: Union[np.ndarray, paddle.Tensor],
 
   pdb_path = mmcif_path.replace('.cif', '.pdb')
   pdb_path = prediction_to_pdb(pred_atom_pos, FeatsDict, pdb_path)
-  msg = os.system(f'{maxit_binary} -i {pdb_path} -o 1 -output {mmcif_path}')
-  if msg != 0:
-    print(f'convert pdb to cif failed, error message: {msg}')
+
+  cmd=[maxit_binary,
+       '-i', pdb_path,
+       '-o', '1',
+       '-output', mmcif_path,
+       ]
+  
+  print('Launching subprocess "%s"', ' '.join(cmd))
+
+  process = subprocess.Popen(
+      ' '.join(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=os.environ.copy())
+
+
+  stdout, stderr = process.communicate()
+  retcode = process.wait()
+
+
+  if retcode:
+    # Logs have a 15k character limit, so log HHblits error line by line.
+    print('maxit failed. HHblits stderr begin:')
+    raise RuntimeError('HHblits failed\nstdout:\n%s\n\nstderr:\n%s\n' % (
+        stdout.decode('utf-8'), stderr[:500_000].decode('utf-8')))
+
   return mmcif_path
\ No newline at end of file

From 5568284d92808865e27f5fe04632dff4f9ce4cab Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sat, 17 Aug 2024 18:29:16 +0800
Subject: [PATCH 09/42] fix: maxit run with env

---
 .../helixfold3/helixfold/common/all_atom_pdb_save.py            | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
index f15f956d..92e7d225 100644
--- a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
+++ b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
@@ -185,7 +185,7 @@ def prediction_to_mmcif(pred_atom_pos: Union[np.ndarray, paddle.Tensor],
   print('Launching subprocess "%s"', ' '.join(cmd))
 
   process = subprocess.Popen(
-      ' '.join(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=os.environ.copy())
+      cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=os.environ.copy())
 
 
   stdout, stderr = process.communicate()

From ae9146a5b01fc352999b893fd9ee8a6de5ab9c3c Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sat, 17 Aug 2024 18:43:53 +0800
Subject: [PATCH 10/42] copy of license

---
 apps/protein_folding/helixfold3/FOLDING.md    |  7 ++++
 .../helixfold3/helixfold/LICENSE              | 36 +++++++++++++++++++
 .../helixfold/common/all_atom_pdb_save.py     | 12 +++++--
 .../helixfold3/helixfold/inference.py         |  2 +-
 4 files changed, 53 insertions(+), 4 deletions(-)
 create mode 100644 apps/protein_folding/helixfold3/FOLDING.md
 create mode 100644 apps/protein_folding/helixfold3/helixfold/LICENSE

diff --git a/apps/protein_folding/helixfold3/FOLDING.md b/apps/protein_folding/helixfold3/FOLDING.md
new file mode 100644
index 00000000..17dc6ebc
--- /dev/null
+++ b/apps/protein_folding/helixfold3/FOLDING.md
@@ -0,0 +1,7 @@
+# FOLD GUIDE
+
+## Quick example
+```shell
+LD_LIBRARY_PATH=/mnt/data/envs/conda_env/envs/helixfold/lib/:$LD_LIBRARY_PATH helixfold input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_8ecx.json  output=. CONFIG_DIFFS.preset=allatom_demo
+
+```
\ No newline at end of file
diff --git a/apps/protein_folding/helixfold3/helixfold/LICENSE b/apps/protein_folding/helixfold3/helixfold/LICENSE
new file mode 100644
index 00000000..fb84eeb3
--- /dev/null
+++ b/apps/protein_folding/helixfold3/helixfold/LICENSE
@@ -0,0 +1,36 @@
+# Terms of Use Agreement
+The HelixFold 3 open-source code and any derivative works are only available for non-commercial use by individuals and non-commercial organizations (universities, non-profit organizations and research institutes, educational and government bodies), or for journalism.
+
+
+# Usage Restrictions:
+The use of the HelixFold 3 open-source code is subject to the following conditions and restrictions:
+   1. Commercial Use Prohibited: The code and its outputs shall not be used by or for commercial entities, nor in any context related to commercial activities, including conducting research for commercial purposes or conferring any rights to commercial entities to use the outputs.
+
+   2. Integration with Automated Systems: The open-source code shall not be incorporated into any unauthorized automated systems, including, but not limited to, workflows for the screening or design of biomolecules.
+
+   3. Attribution Requirement: In any academic publication derived from the use of the open-source code, proper attribution to the original authors must be given in the manner specified, including the retention of the author's name, copyright notice, and a link to the license agreement, as well as a statement indicating whether modifications were made to the materials.
+
+   4. Share-Alike Obligation: If you modify or adapt the materials (e.g., through translation or rewriting), you must distribute your derivative works under the same "Attribution-NonCommercial-ShareAlike" license. This ensures that others may use your adapted work under identical terms.
+   
+   5. Prohibition on Illegal or Malicious Use: The code shall not be used to facilitate or engage in activities that are illegal, dangerous, or malicious.
+
+For commercial use, you must contact the authors to obtain a commercial license.\
+**Disclaimers：**
+
+The HelixFold 3 open-source code any derivative works are only for theoretical modelling. These are not intended, validated, or approved for clinical use.
+
+**Governing Law and Dispute Resolution:** \
+This agreement is governed by the laws of the People's Republic of China. In the event of a dispute arising from the execution of this agreement, the parties shall attempt to resolve it through amicable consultation. If consultation fails, either party may submit the dispute to the People's Court of Haidian District, Beijing for adjudication.
+
+**Termination:**
+   - **Automatic Termination:** The rights granted under this license will automatically terminate if you violate any of its terms. Rights may be reinstated upon rectification of the violation or with the licensor's express consent.
+
+   - **Surviving Provisions:** Certain provisions, including disclaimers and liability limitations, shall remain in effect even after the termination of this license.
+
+
+
+**Miscellaneous Provisions:**
+   - **No Additional Restrictions:** You may not impose any additional restrictions on the use of the materials, such as implementing technical protection measures (e.g., encryption), as doing so would contravene the spirit of this license.
+   - **Interpretation Authority:** Baidu reserves the right to make reasonable interpretations of this open-source service.
+
+Should any provision of this agreement be deemed invalid or unenforceable, the remaining provisions shall continue to be in full force and effect.
diff --git a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
index 92e7d225..abc33ef6 100644
--- a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
+++ b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
@@ -184,6 +184,10 @@ def prediction_to_mmcif(pred_atom_pos: Union[np.ndarray, paddle.Tensor],
   
   print('Launching subprocess "%s"', ' '.join(cmd))
 
+  if os.path.exists('maxit.log'):
+     os.remove('maxit.log')
+    
+
   process = subprocess.Popen(
       cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=os.environ.copy())
 
@@ -194,8 +198,10 @@ def prediction_to_mmcif(pred_atom_pos: Union[np.ndarray, paddle.Tensor],
 
   if retcode:
     # Logs have a 15k character limit, so log HHblits error line by line.
-    print('maxit failed. HHblits stderr begin:')
-    raise RuntimeError('HHblits failed\nstdout:\n%s\n\nstderr:\n%s\n' % (
-        stdout.decode('utf-8'), stderr[:500_000].decode('utf-8')))
+    print('Maxit failed. Maxit stderr begin:')
+    raise RuntimeError(f'Maxit failed\nstdout:\n{stdout.decode("utf-8")}\n\n'
+                       f'stderr:\n{stderr[:500_000].decode("utf-8")}\n'
+                       f'logfile:\n{open("maxit.log", "r").read().strip() if os.path.isfile("maxit.log") else ""}\n'
+                       f'Env:\n{os.environ.copy()}')
 
   return mmcif_path
\ No newline at end of file
diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py
index 97ae415c..5a883094 100644
--- a/apps/protein_folding/helixfold3/helixfold/inference.py
+++ b/apps/protein_folding/helixfold3/helixfold/inference.py
@@ -461,7 +461,7 @@ def main(cfg: DictConfig):
     ## check maxit binary path
     maxit_binary=resolve_bin_path(cfg.other.maxit_binary,'maxit')
     
-    RCSBROOT=os.path.dirname(maxit_binary)
+    RCSBROOT=os.path.join(os.path.dirname(maxit_binary), '..')
     os.environ['RCSBROOT']=RCSBROOT
 
     ## check obabel

From 5aae769b0dd7b5064db7c436ec18b1ecd0e7badf Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sat, 17 Aug 2024 19:05:59 +0800
Subject: [PATCH 11/42] fix: obabel bin resolve

---
 apps/protein_folding/helixfold3/FOLDING.md             | 5 +++++
 apps/protein_folding/helixfold3/helixfold/inference.py | 3 ++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/apps/protein_folding/helixfold3/FOLDING.md b/apps/protein_folding/helixfold3/FOLDING.md
index 17dc6ebc..80170f97 100644
--- a/apps/protein_folding/helixfold3/FOLDING.md
+++ b/apps/protein_folding/helixfold3/FOLDING.md
@@ -1,7 +1,12 @@
 # FOLD GUIDE
 
 ## Quick example
+Run from default config
 ```shell
 LD_LIBRARY_PATH=/mnt/data/envs/conda_env/envs/helixfold/lib/:$LD_LIBRARY_PATH helixfold input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_8ecx.json  output=. CONFIG_DIFFS.preset=allatom_demo
+```
 
+Run with customized configuration dir and name:
+```shell
+LD_LIBRARY_PATH=/mnt/data/envs/conda_env/envs/helixfold/lib/:$LD_LIBRARY_PATH helixfold --config-dir=. --config-name=myfold input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_6zcy_smiles.json  output=. CONFIG_DIFFS.preset=allatom_demo
 ```
\ No newline at end of file
diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py
index 5a883094..d26240cf 100644
--- a/apps/protein_folding/helixfold3/helixfold/inference.py
+++ b/apps/protein_folding/helixfold3/helixfold/inference.py
@@ -457,7 +457,6 @@ def main(cfg: DictConfig):
     print(f'>>> FLAGS_new_einsum: {new_einsum}')
     print(f'>>> config:\n{cfg}')
 
-    all_entitys = preprocess_json_entity(cfg.input, cfg.output)
     ## check maxit binary path
     maxit_binary=resolve_bin_path(cfg.other.maxit_binary,'maxit')
     
@@ -468,6 +467,8 @@ def main(cfg: DictConfig):
     obabel_bin=resolve_bin_path(cfg.bin.obabel,'obabel')
     os.environ['OBABEL_BIN']=obabel_bin
 
+    all_entitys = preprocess_json_entity(cfg.input, cfg.output)
+    
     ### Set seed for reproducibility
     seed = cfg.seed
     if seed is None:

From b63e7220799c38a4d6b79920f606ad7e73fec2bf Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sat, 17 Aug 2024 19:07:15 +0800
Subject: [PATCH 12/42] doc: use conda prefix

---
 apps/protein_folding/helixfold3/FOLDING.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/protein_folding/helixfold3/FOLDING.md b/apps/protein_folding/helixfold3/FOLDING.md
index 80170f97..b658f016 100644
--- a/apps/protein_folding/helixfold3/FOLDING.md
+++ b/apps/protein_folding/helixfold3/FOLDING.md
@@ -3,10 +3,10 @@
 ## Quick example
 Run from default config
 ```shell
-LD_LIBRARY_PATH=/mnt/data/envs/conda_env/envs/helixfold/lib/:$LD_LIBRARY_PATH helixfold input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_8ecx.json  output=. CONFIG_DIFFS.preset=allatom_demo
+LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH helixfold input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_8ecx.json  output=. CONFIG_DIFFS.preset=allatom_demo
 ```
 
 Run with customized configuration dir and name:
 ```shell
-LD_LIBRARY_PATH=/mnt/data/envs/conda_env/envs/helixfold/lib/:$LD_LIBRARY_PATH helixfold --config-dir=. --config-name=myfold input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_6zcy_smiles.json  output=. CONFIG_DIFFS.preset=allatom_demo
+LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH helixfold --config-dir=. --config-name=myfold input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_6zcy_smiles.json  output=. CONFIG_DIFFS.preset=allatom_demo
 ```
\ No newline at end of file

From 0863ecfe298e5a44419e6e96c4bec8cab3b4c240 Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sat, 17 Aug 2024 21:48:01 +0800
Subject: [PATCH 13/42] doc: merge to readme

---
 apps/protein_folding/helixfold3/FOLDING.md    | 12 ---
 apps/protein_folding/helixfold3/README.md     | 88 +++++++------------
 .../helixfold/config/helixfold.yaml           |  1 +
 .../helixfold3/helixfold/inference.py         |  3 +-
 .../helixfold3/requirements.txt               | 13 ---
 apps/protein_folding/helixfold3/run_infer.sh  | 39 --------
 apps/protein_folding/helixfold3/setup_env.sh  | 18 ----
 7 files changed, 37 insertions(+), 137 deletions(-)
 delete mode 100644 apps/protein_folding/helixfold3/FOLDING.md
 delete mode 100644 apps/protein_folding/helixfold3/requirements.txt
 delete mode 100644 apps/protein_folding/helixfold3/run_infer.sh
 delete mode 100644 apps/protein_folding/helixfold3/setup_env.sh

diff --git a/apps/protein_folding/helixfold3/FOLDING.md b/apps/protein_folding/helixfold3/FOLDING.md
deleted file mode 100644
index b658f016..00000000
--- a/apps/protein_folding/helixfold3/FOLDING.md
+++ /dev/null
@@ -1,12 +0,0 @@
-# FOLD GUIDE
-
-## Quick example
-Run from default config
-```shell
-LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH helixfold input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_8ecx.json  output=. CONFIG_DIFFS.preset=allatom_demo
-```
-
-Run with customized configuration dir and name:
-```shell
-LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH helixfold --config-dir=. --config-name=myfold input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_6zcy_smiles.json  output=. CONFIG_DIFFS.preset=allatom_demo
-```
\ No newline at end of file
diff --git a/apps/protein_folding/helixfold3/README.md b/apps/protein_folding/helixfold3/README.md
index 12ed7041..bdf4264e 100644
--- a/apps/protein_folding/helixfold3/README.md
+++ b/apps/protein_folding/helixfold3/README.md
@@ -44,17 +44,26 @@ Locate to the directory of `helixfold` then run:
 ```bash
 # Install py env
 conda create -n helixfold -c conda-forge python=3.9
-conda install -y -c bioconda aria2 hmmer==3.3.2 kalign2==2.04 hhsuite==3.3.0 -n helixfold
-conda install -y -c conda-forge openbabel -n helixfold
 
 # activate the conda environment
 conda activate helixfold
 
+# adjust these version numbers as your situation
+conda install -y cudnn=8.4.1 cudatoolkit=11.7 nccl=2.14.3 -c conda-forge -c nvidia
+conda install -y -c bioconda hmmer==3.3.2 kalign2==2.04 hhsuite==3.3.0 
+conda install -y -c conda-forge openbabel
+
 # install paddlepaddle
-python3 -m pip install paddlepaddle-gpu==2.6.1.post120 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
+pip install paddlepaddle-gpu==2.6.1.post120 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
 # or lower version: https://paddle-wheel.bj.bcebos.com/2.5.1/linux/linux-gpu-cuda11.7-cudnn8.4.1-mkl-gcc8.2-avx/paddlepaddle_gpu-2.5.1.post117-cp39-cp39-linux_x86_64.whl
 
-python3 -m pip install -r requirements.txt
+# downgrade pip
+pip install --upgrade 'pip<24'
+
+# edit configuration file at `/helixfold/config/helixfold.yaml` to set your databases and binaries correctly
+
+# install HF3 as a python library
+pip install .  --no-cache-dir
 ```
 
 Note: If you have a different version of python3 and cuda, please refer to [here](https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html) for the compatible PaddlePaddle `dev` package.
@@ -125,58 +134,29 @@ sh run_infer.sh
 ```
 
 The script is as follows,
-```bash
-#!/bin/bash
-
-PYTHON_BIN="PATH/TO/YOUR/PYTHON"
-ENV_BIN="PATH/TO/YOUR/ENV"
-MAXIT_SRC="PATH/TO/MAXIT/SRC"
-DATA_DIR="PATH/TO/DATA"
-export OBABEL_BIN="PATH/TO/OBABEL/BIN"
-export PATH="$MAXIT_BIN/bin:$PATH"
-
-CUDA_VISIBLE_DEVICES=0 "$PYTHON_BIN" inference.py \
-    --maxit_binary "$MAXIT_SRC/bin/maxit" \
-    --jackhmmer_binary_path "$ENV_BIN/jackhmmer" \
-	--hhblits_binary_path "$ENV_BIN/hhblits" \
-	--hhsearch_binary_path "$ENV_BIN/hhsearch" \
-	--kalign_binary_path "$ENV_BIN/kalign" \
-	--hmmsearch_binary_path "$ENV_BIN/hmmsearch" \
-	--hmmbuild_binary_path "$ENV_BIN/hmmbuild" \
-    --nhmmer_binary_path "$ENV_BIN/nhmmer" \
-    --preset='reduced_dbs' \
-    --bfd_database_path "$DATA_DIR/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt" \
-    --small_bfd_database_path "$DATA_DIR/small_bfd/bfd-first_non_consensus_sequences.fasta" \
-    --bfd_database_path "$DATA_DIR/small_bfd/bfd-first_non_consensus_sequences.fasta" \
-    --uniclust30_database_path "$DATA_DIR/uniclust30/uniclust30_2018_08/uniclust30_2018_08" \
-    --uniprot_database_path "$DATA_DIR/uniprot/uniprot.fasta" \
-    --pdb_seqres_database_path "$DATA_DIR/pdb_seqres/pdb_seqres.txt" \
-    --uniref90_database_path "$DATA_DIR/uniref90/uniref90.fasta" \
-    --mgnify_database_path "$DATA_DIR/mgnify/mgy_clusters_2018_12.fa" \
-    --template_mmcif_dir "$DATA_DIR/pdb_mmcif/mmcif_files" \
-    --obsolete_pdbs_path "$DATA_DIR/pdb_mmcif/obsolete.dat" \
-    --ccd_preprocessed_path "$DATA_DIR/ccd_preprocessed_etkdg.pkl.gz" \
-    --rfam_database_path "$DATA_DIR/Rfam-14.9_rep_seq.fasta" \
-    --max_template_date=2020-05-14 \
-    --input_json data/demo_protein_ligand.json \
-    --output_dir ./output \
-    --model_name allatom_demo \
-    --init_model ./init_models/checkpoints.pdparams \
-    --infer_times 3 \
-    --precision "fp32"
+##### Run from default config
+```shell
+LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH \
+helixfold \
+    input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_8ecx.json \
+    output=. CONFIG_DIFFS.preset=allatom_demo
 ```
+
+##### Run with customized configuration dir and file(`./myfold.yaml`, for example):
+```shell
+LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH \
+helixfold --config-dir=. --config-name=myfold \
+    input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_6zcy_smiles.json \
+    output=. CONFIG_DIFFS.preset=allatom_demo
+```
+
 The descriptions of the above script are as follows:
-* Replace `MAXIT_SRC` with your installed `maxit`'s root path.
-* Replace `DATA_DIR` with your downloaded data path.
-* Replace `OBABEL_BIN` with your installed `openbabel` path.
-* Replace `ENV_BIN` with your conda virtual environment or any environment where `hhblits`, `hmmsearch` and other dependencies have been installed.
-* `--preset` - Set `'reduced_dbs'` to use small bfd or `'full_dbs'` to use full bfd.
-* `--*_database_path` - Path to datasets you have downloaded.
-* `--input_json` - Input data in the form of JSON. Input pattern in `./data/demo_*.json` for your reference.
-* `--output_dir` - Model output path. The output will be in a folder named the same as your `--input_json` under this path.
-* `--model_name` - Model name in `./helixfold/model/config.py`. Different model names specify different configurations. Mirro modification to configuration can be specified in `CONFIG_DIFFS` in the `config.py` without change to the full configuration in `CONFIG_ALLATOM`.
-* `--infer_time` - The number of inferences executed by model for single input. In each inference, the model will infer `5` times (`diff_batch_size`) for the same input by default. This hyperparameter can be changed by `model.head.diffusion_module.test_diff_batch_size` within `./helixfold/model/config.py`
-* `--precision` - Either `bf16` or `fp32`. Please check if your machine can support `bf16` or not beforing changing it. For example, `bf16` is supported by A100 and H100 or higher version while V100 only supports `fp32`.
+* `LD_LIBRARY_PATH` - This is required to load the `libcudnn.so` library if you encounter issue like `RuntimeError: (PreconditionNotMet) Cannot load cudnn shared library. Cannot invoke method cudnnGetVersion.`
+* `config-dir` - The directory that contains the alterative configuration file you would like to use.
+* `config-name` - The name of the configuration file you would like to use.
+* `input` - Input data in the form of JSON. Input pattern in `./data/demo_*.json` for your reference.
+* `output` - Model output path. The output will be in a folder named the same as your `--input_json` under this path.
+* `--CONFIG_DIFFS.preset` - Model name in `./helixfold/model/config.py`. Different model names specify different configurations. Mirro modification to configuration can be specified in `CONFIG_DIFFS` in the `config.py` without change to the full configuration in `CONFIG_ALLATOM`.
 
 ### Understanding Model Output
 
diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
index 623a3561..262cc517 100644
--- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
+++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
@@ -12,6 +12,7 @@ amp_level: O1  # Corresponds to --amp_level
 infer_times: 1  # Corresponds to --infer_times
 diff_batch_size: -1  # Corresponds to --diff_batch_size
 use_small_bfd: false # Corresponds to --use_small_bfd
+msa_only: false # Only process msa
 
 # File paths
 
diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py
index d26240cf..0ab811b9 100644
--- a/apps/protein_folding/helixfold3/helixfold/inference.py
+++ b/apps/protein_folding/helixfold3/helixfold/inference.py
@@ -39,12 +39,13 @@
 from helixfold.data.tools import hmmsearch
 from helixfold.data import templates
 from helixfold.utils.utils import get_custom_amp_list
-from helixfold.utils.model import RunModel
 from helixfold.utils.misc import set_logging_level
 from typing import Dict
 from helixfold.infer_scripts import feature_processing_aa, preprocess
 from helixfold.infer_scripts.tools import mmcif_writer
 
+from apps.protein_folding.helixfold.alphafold_paddle.model.model import RunModel
+
 script_path=os.path.dirname(__file__)
 
 ALLOWED_LIGAND_BONDS_TYPE_MAP = preprocess.ALLOWED_LIGAND_BONDS_TYPE_MAP
diff --git a/apps/protein_folding/helixfold3/requirements.txt b/apps/protein_folding/helixfold3/requirements.txt
deleted file mode 100644
index 660e43c1..00000000
--- a/apps/protein_folding/helixfold3/requirements.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-absl-py==0.13.0
-biopython==1.79
-chex==0.0.7
-dm-haiku==0.0.4
-dm-tree==0.1.6
-docker==5.0.0
-immutabledict==2.0.0
-jax==0.2.14
-ml-collections==0.1.0
-pandas==1.3.4
-scipy==1.9.0
-rdkit-pypi==2022.9.5 
-posebusters
\ No newline at end of file
diff --git a/apps/protein_folding/helixfold3/run_infer.sh b/apps/protein_folding/helixfold3/run_infer.sh
deleted file mode 100644
index 5b0644e5..00000000
--- a/apps/protein_folding/helixfold3/run_infer.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/bin/bash
-
-PYTHON_BIN="/usr/bin/python3" # changes to your python
-ENV_BIN="/root/miniconda3/bin"  # change to your env
-MAXIT_SRC="PATH/TO/MAXIT/SRC" # changes to your MAXIT
-export OBABEL_BIN="PATH/TO/OBABEL/BIN" # changes to your openbabel
-DATA_DIR="./data"
-export PATH="$MAXIT_SRC/bin:$PATH"
-
-CUDA_VISIBLE_DEVICES=0 "$PYTHON_BIN" inference.py \
-    --maxit_binary "$MAXIT_SRC/bin/maxit" \
-    --jackhmmer_binary_path "$ENV_BIN/jackhmmer" \
-	--hhblits_binary_path "$ENV_BIN/hhblits" \
-	--hhsearch_binary_path "$ENV_BIN/hhsearch" \
-	--kalign_binary_path "$ENV_BIN/kalign" \
-	--hmmsearch_binary_path "$ENV_BIN/hmmsearch" \
-	--hmmbuild_binary_path "$ENV_BIN/hmmbuild" \
-    --nhmmer_binary_path "$ENV_BIN/nhmmer" \
-    --preset='reduced_dbs' \
-    --bfd_database_path "$DATA_DIR/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt" \
-    --small_bfd_database_path "$DATA_DIR/small_bfd/bfd-first_non_consensus_sequences.fasta" \
-    --bfd_database_path "$DATA_DIR/small_bfd/bfd-first_non_consensus_sequences.fasta" \
-    --uniclust30_database_path "$DATA_DIR/uniclust30/uniclust30_2018_08/uniclust30_2018_08" \
-    --uniprot_database_path "$DATA_DIR/uniprot/uniprot.fasta" \
-    --pdb_seqres_database_path "$DATA_DIR/pdb_seqres/pdb_seqres.txt" \
-    --uniref90_database_path "$DATA_DIR/uniref90/uniref90.fasta" \
-    --mgnify_database_path "$DATA_DIR/mgnify/mgy_clusters_2018_12.fa" \
-    --template_mmcif_dir "$DATA_DIR/pdb_mmcif/mmcif_files" \
-    --obsolete_pdbs_path "$DATA_DIR/pdb_mmcif/obsolete.dat" \
-    --ccd_preprocessed_path "$DATA_DIR/ccd_preprocessed_etkdg.pkl.gz" \
-    --rfam_database_path "$DATA_DIR/Rfam-14.9_rep_seq.fasta" \
-    --max_template_date=2020-05-14 \
-    --input_json data/demo_6zcy.json \
-    --output_dir ./output \
-    --model_name allatom_demo \
-    --init_model init_models/HelixFold3-240814.pdparams \
-    --infer_times 1 \
-    --diff_batch_size 1 \
-    --precision "fp32"
\ No newline at end of file
diff --git a/apps/protein_folding/helixfold3/setup_env.sh b/apps/protein_folding/helixfold3/setup_env.sh
deleted file mode 100644
index 30f008d6..00000000
--- a/apps/protein_folding/helixfold3/setup_env.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-
-ENV_NAME='helixfold'
-CUDA=12.0
-
-# follow https://developer.nvidia.com/cuda-downloads to install cuda and cudatoolkit
-
-# Install py env
-conda create -n ${ENV_NAME} -y -c conda-forge  pip  python=3.9;
-source activate ${ENV_NAME}
-conda install -y cudnn=8.4.1 cudatoolkit=11.7 nccl=2.14.3 -c conda-forge -c nvidia
-
-conda install -y -c bioconda hmmer==3.3.2 kalign2==2.04 hhsuite==3.3.0 
-conda install -y -c conda-forge openbabel
-
-python -m pip install --upgrade 'pip<24';pip install .  --no-cache-dir
-
-pip install https://paddle-wheel.bj.bcebos.com/2.5.1/linux/linux-gpu-cuda11.7-cudnn8.4.1-mkl-gcc8.2-avx/paddlepaddle_gpu-2.5.1.post117-cp39-cp39-linux_x86_64.whl

From 84a550cb770b1ca827d1f263dd605282d42e534d Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sat, 17 Aug 2024 22:01:36 +0800
Subject: [PATCH 14/42] fix: imports

---
 apps/protein_folding/helixfold3/README.md          |  3 +++
 .../helixfold3/helixfold/config/helixfold.yaml     | 14 +++++++-------
 .../helixfold3/helixfold/inference.py              |  2 +-
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/apps/protein_folding/helixfold3/README.md b/apps/protein_folding/helixfold3/README.md
index bdf4264e..c0bed52c 100644
--- a/apps/protein_folding/helixfold3/README.md
+++ b/apps/protein_folding/helixfold3/README.md
@@ -150,6 +150,8 @@ helixfold --config-dir=. --config-name=myfold \
     output=. CONFIG_DIFFS.preset=allatom_demo
 ```
 
+
+
 The descriptions of the above script are as follows:
 * `LD_LIBRARY_PATH` - This is required to load the `libcudnn.so` library if you encounter issue like `RuntimeError: (PreconditionNotMet) Cannot load cudnn shared library. Cannot invoke method cudnnGetVersion.`
 * `config-dir` - The directory that contains the alterative configuration file you would like to use.
@@ -158,6 +160,7 @@ The descriptions of the above script are as follows:
 * `output` - Model output path. The output will be in a folder named the same as your `--input_json` under this path.
 * `--CONFIG_DIFFS.preset` - Model name in `./helixfold/model/config.py`. Different model names specify different configurations. Mirro modification to configuration can be specified in `CONFIG_DIFFS` in the `config.py` without change to the full configuration in `CONFIG_ALLATOM`.
 
+
 ### Understanding Model Output
 
 The outputs will be in a subfolder of `output_dir`, including the computed MSAs, predicted structures, 
diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
index 262cc517..7de2fd7a 100644
--- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
+++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
@@ -62,10 +62,10 @@ other:
 # CONFIG_DIFFS for advanced configuration
 CONFIG_DIFFS:
   preset: null #choices=['null','allatom_demo', 'allatom_subbatch_64_recycle_1']
-  model:
-    global_config:
-      subbatch_size: 96 # model.global_config.subbatch_size
-    num_recycle: 3 # model.num_recycle
-    heads:
-      confidence_head:
-        weight: 0.0 # model.heads.confidence_head.weight
+  # model:
+    # global_config:
+    #   subbatch_size: 96 # model.global_config.subbatch_size
+    # num_recycle: 3 # model.num_recycle
+    # heads:
+    #   confidence_head:
+    #     weight: 0.0 # model.heads.confidence_head.weight
diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py
index 0ab811b9..0213b725 100644
--- a/apps/protein_folding/helixfold3/helixfold/inference.py
+++ b/apps/protein_folding/helixfold3/helixfold/inference.py
@@ -36,6 +36,7 @@
 from helixfold.data import pipeline_rna_parallel as pipeline_rna
 from helixfold.data import pipeline_rna_multimer
 from helixfold.data.utils import atom_level_keys, map_to_continuous_indices
+from helixfold.utils.model import RunModel
 from helixfold.data.tools import hmmsearch
 from helixfold.data import templates
 from helixfold.utils.utils import get_custom_amp_list
@@ -44,7 +45,6 @@
 from helixfold.infer_scripts import feature_processing_aa, preprocess
 from helixfold.infer_scripts.tools import mmcif_writer
 
-from apps.protein_folding.helixfold.alphafold_paddle.model.model import RunModel
 
 script_path=os.path.dirname(__file__)
 

From 21d41088f0d10ab7c1f1da5a658a2ff3b9a44dd0 Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sat, 17 Aug 2024 22:40:49 +0800
Subject: [PATCH 15/42] fix: config overriden

---
 apps/protein_folding/helixfold3/README.md     | 14 +++++++--
 .../helixfold3/helixfold/inference.py         |  1 +
 .../helixfold3/helixfold/model/config.py      | 30 +++++++++----------
 3 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/apps/protein_folding/helixfold3/README.md b/apps/protein_folding/helixfold3/README.md
index c0bed52c..7c1ba474 100644
--- a/apps/protein_folding/helixfold3/README.md
+++ b/apps/protein_folding/helixfold3/README.md
@@ -150,7 +150,15 @@ helixfold --config-dir=. --config-name=myfold \
     output=. CONFIG_DIFFS.preset=allatom_demo
 ```
 
-
+##### Run with additional configuration term 
+```shell
+LD_LIBRARY_PATH=/mnt/data/envs/conda_env/envs/helixfold/lib/:$LD_LIBRARY_PATH \
+helixfold \
+    input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_6zcy.json \
+    output=. \
+    CONFIG_DIFFS.model.heads.confidence_head.weight=0.01 \
+    CONFIG_DIFFS.model.global_config.subbatch_size=192
+```
 
 The descriptions of the above script are as follows:
 * `LD_LIBRARY_PATH` - This is required to load the `libcudnn.so` library if you encounter issue like `RuntimeError: (PreconditionNotMet) Cannot load cudnn shared library. Cannot invoke method cudnnGetVersion.`
@@ -158,8 +166,8 @@ The descriptions of the above script are as follows:
 * `config-name` - The name of the configuration file you would like to use.
 * `input` - Input data in the form of JSON. Input pattern in `./data/demo_*.json` for your reference.
 * `output` - Model output path. The output will be in a folder named the same as your `--input_json` under this path.
-* `--CONFIG_DIFFS.preset` - Model name in `./helixfold/model/config.py`. Different model names specify different configurations. Mirro modification to configuration can be specified in `CONFIG_DIFFS` in the `config.py` without change to the full configuration in `CONFIG_ALLATOM`.
-
+* `CONFIG_DIFFS.preset` - Model name in `./helixfold/model/config.py`. Different model names specify different configurations. Mirro modification to configuration can be specified in `CONFIG_DIFFS` in the `config.py` without change to the full configuration in `CONFIG_ALLATOM`.
+* `CONFIG_DIFFS.*` - Override model any configuration in `CONFIG_ALLATOM`.
 
 ### Understanding Model Output
 
diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py
index 0213b725..a326bbd0 100644
--- a/apps/protein_folding/helixfold3/helixfold/inference.py
+++ b/apps/protein_folding/helixfold3/helixfold/inference.py
@@ -491,6 +491,7 @@ def main(cfg: DictConfig):
         
     ### Create model
     model_config = config.model_config(cfg.CONFIG_DIFFS)
+    logging.warning(f'>>> Model config: \n{model_config}\n\n')
 
     model = RunModel(model_config)
 
diff --git a/apps/protein_folding/helixfold3/helixfold/model/config.py b/apps/protein_folding/helixfold3/helixfold/model/config.py
index c4682b29..f6d2fe97 100644
--- a/apps/protein_folding/helixfold3/helixfold/model/config.py
+++ b/apps/protein_folding/helixfold3/helixfold/model/config.py
@@ -16,7 +16,6 @@
 
 import copy
 from typing import Any, Union
-import ml_collections
 from omegaconf import DictConfig
 
 
@@ -26,7 +25,7 @@
 NUM_TEMPLATES = 'num templates placeholder'
 
 
-def model_config(config_diffs: Union[str, DictConfig, dict[str, dict[str, Any]]]) -> ml_collections.ConfigDict:
+def model_config(config_diffs: Union[str, DictConfig, dict[str, dict[str, Any]]]) -> DictConfig:
   """Get the ConfigDict of a model."""
 
   cfg = copy.deepcopy(CONFIG_ALLATOM)
@@ -37,34 +36,33 @@ def model_config(config_diffs: Union[str, DictConfig, dict[str, dict[str, Any]]]
   if isinstance(config_diffs, DictConfig):
     if 'preset' in config_diffs and (preset_name:=config_diffs['preset']) in CONFIG_DIFFS:
       updated_config=CONFIG_DIFFS[preset_name]
-      cfg.update_from_flattened_dict(updated_config)
+      cfg.merge_with_dotlist(updated_config)
       print(f'Updated config from `CONFIG_DIFFS.{preset_name}`: {updated_config}')
 
       return cfg
     
     # update from detailed configuration
     if any(root_kw in config_diffs for root_kw in CONFIG_ALLATOM):
-      for root_kw in CONFIG_ALLATOM:
-        if root_kw in config_diffs and (updated_config := config_diffs[root_kw]) is not None:
-          cfg.update(dict(updated_config))
-          print(f'Updated config from `CONFIG_DIFFS`: {updated_config}')
+
+      cfg.merge_with(config_diffs) # merge to override
+      print(f'Updated config from `CONFIG_DIFFS`: {config_diffs}')
       return cfg
   
   raise ValueError(f'Invalid config_diffs ({type(config_diffs)}): {config_diffs}')
     
 
 # preset for runs
-CONFIG_DIFFS = {
-    'allatom_demo': {
-        'model.heads.confidence_head.weight': 0.01
-    },
-    'allatom_subbatch_64_recycle_1': {
-        'model.global_config.subbatch_size': 64,
-        'model.num_recycle': 1,
-    },
+CONFIG_DIFFS: dict[str, list[str]] = {
+    'allatom_demo': [
+      'model.heads.confidence_head.weight=0.01'
+      ],
+    'allatom_subbatch_64_recycle_1': [
+        'model.global_config.subbatch_size=64',
+        'model.num_recycle=1',
+    ]
 }
 
-CONFIG_ALLATOM = ml_collections.ConfigDict({
+CONFIG_ALLATOM = DictConfig({
   'data': {   
     'num_blocks': 5,    # for msa block deletion
     'randomize_num_blocks': True,

From 75616744852b63312eff2446b3232fb600aafee5 Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sat, 17 Aug 2024 23:06:06 +0800
Subject: [PATCH 16/42] doc: update for cli

---
 apps/protein_folding/helixfold3/README.md     | 25 +++++++++++--------
 .../helixfold/config/helixfold.yaml           |  2 ++
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/apps/protein_folding/helixfold3/README.md b/apps/protein_folding/helixfold3/README.md
index 7c1ba474..cc1a3109 100644
--- a/apps/protein_folding/helixfold3/README.md
+++ b/apps/protein_folding/helixfold3/README.md
@@ -36,7 +36,7 @@ Those settings are recommended as they are the same as we used in our A100 machi
 ### Installation
 
 HelixFold3 depends on [PaddlePaddle](https://github.com/paddlepaddle/paddle). Python dependencies available through `pip` 
-is provided in `requirements.txt`. `kalign`, the [`HH-suite`](https://github.com/soedinglab/hh-suite) and `jackhmmer` are 
+is provided with `pyproject.toml`. `kalign`, the [`HH-suite`](https://github.com/soedinglab/hh-suite) and `jackhmmer` are 
 also needed to produce multiple sequence alignments. The download scripts require `aria2c`. 
 
 Locate to the directory of `helixfold` then run:
@@ -50,7 +50,7 @@ conda activate helixfold
 
 # adjust these version numbers as your situation
 conda install -y cudnn=8.4.1 cudatoolkit=11.7 nccl=2.14.3 -c conda-forge -c nvidia
-conda install -y -c bioconda hmmer==3.3.2 kalign2==2.04 hhsuite==3.3.0 
+conda install -y -c bioconda aria2 hmmer==3.3.2 kalign2==2.04 hhsuite==3.3.0 
 conda install -y -c conda-forge openbabel
 
 # install paddlepaddle
@@ -60,7 +60,7 @@ pip install paddlepaddle-gpu==2.6.1.post120 -f https://www.paddlepaddle.org.cn/w
 # downgrade pip
 pip install --upgrade 'pip<24'
 
-# edit configuration file at `/helixfold/config/helixfold.yaml` to set your databases and binaries correctly
+# edit configuration file at `./helixfold/config/helixfold.yaml` to set your databases and binaries correctly.
 
 # install HF3 as a python library
 pip install .  --no-cache-dir
@@ -138,26 +138,29 @@ The script is as follows,
 ```shell
 LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH \
 helixfold \
-    input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_8ecx.json \
-    output=. CONFIG_DIFFS.preset=allatom_demo
+    input=./data/demo_8ecx.json \
+    output=. \
+    CONFIG_DIFFS.preset=allatom_demo
 ```
 
 ##### Run with customized configuration dir and file(`./myfold.yaml`, for example):
 ```shell
 LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH \
 helixfold --config-dir=. --config-name=myfold \
-    input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_6zcy_smiles.json \
-    output=. CONFIG_DIFFS.preset=allatom_demo
+    input=./data/demo_6zcy_smiles.json \
+    output=. \
+    CONFIG_DIFFS.preset=allatom_demo
 ```
 
 ##### Run with additional configuration term 
 ```shell
-LD_LIBRARY_PATH=/mnt/data/envs/conda_env/envs/helixfold/lib/:$LD_LIBRARY_PATH \
+LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH \
 helixfold \
-    input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_6zcy.json \
+    input=./data/demo_6zcy.json \
     output=. \
     CONFIG_DIFFS.model.heads.confidence_head.weight=0.01 \
-    CONFIG_DIFFS.model.global_config.subbatch_size=192
+    CONFIG_DIFFS.model.global_config.subbatch_size=192 \
+    CONFIG_DIFFS.model.num_recycle=10
 ```
 
 The descriptions of the above script are as follows:
@@ -166,7 +169,7 @@ The descriptions of the above script are as follows:
 * `config-name` - The name of the configuration file you would like to use.
 * `input` - Input data in the form of JSON. Input pattern in `./data/demo_*.json` for your reference.
 * `output` - Model output path. The output will be in a folder named the same as your `--input_json` under this path.
-* `CONFIG_DIFFS.preset` - Model name in `./helixfold/model/config.py`. Different model names specify different configurations. Mirro modification to configuration can be specified in `CONFIG_DIFFS` in the `config.py` without change to the full configuration in `CONFIG_ALLATOM`.
+* `CONFIG_DIFFS.preset` - Adjusted model config preset name in `./helixfold/model/config.py:CONFIG_DIFFS`. The preset will be updated into final model configuration with `CONFIG_ALLATOM`.
 * `CONFIG_DIFFS.*` - Override model any configuration in `CONFIG_ALLATOM`.
 
 ### Understanding Model Output
diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
index 7de2fd7a..0015b4d5 100644
--- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
+++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
@@ -62,6 +62,8 @@ other:
 # CONFIG_DIFFS for advanced configuration
 CONFIG_DIFFS:
   preset: null #choices=['null','allatom_demo', 'allatom_subbatch_64_recycle_1']
+
+  # Detailed configuration adjustments against `CONFIG_ALLATOM` can be used here. for example: 
   # model:
     # global_config:
     #   subbatch_size: 96 # model.global_config.subbatch_size

From e8582de19e05b467c366b5e3ef6438c3dd4a3da2 Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sat, 17 Aug 2024 23:17:17 +0800
Subject: [PATCH 17/42] doc: update for cli

---
 apps/protein_folding/helixfold3/README.md | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/apps/protein_folding/helixfold3/README.md b/apps/protein_folding/helixfold3/README.md
index cc1a3109..c8fc9c8c 100644
--- a/apps/protein_folding/helixfold3/README.md
+++ b/apps/protein_folding/helixfold3/README.md
@@ -128,12 +128,7 @@ A example of input data is as follows:
 
 #### Running HelixFold for Inference
 To run inference on a sequence or multiple sequences using HelixFold3's pretrained parameters, run e.g.:
-* Inference on single GPU (change the settings in script BEFORE you run it)
-```
-sh run_infer.sh
-```
 
-The script is as follows,
 ##### Run from default config
 ```shell
 LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH \

From cd924293fd7782da0bf70459b3bdfebfed6761c5 Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sun, 18 Aug 2024 13:24:37 +0800
Subject: [PATCH 18/42] fix: msa parallel for bfd

---
 .../helixfold/common/all_atom_pdb_save.py     |   4 +-
 .../helixfold/config/helixfold.yaml           |   6 +-
 .../helixfold/data/pipeline_parallel.py       | 161 +++++++++---------
 .../helixfold3/helixfold/inference.py         |   4 +-
 .../protein_folding/helixfold3/pyproject.toml |   1 +
 5 files changed, 89 insertions(+), 87 deletions(-)

diff --git a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
index abc33ef6..50b2500a 100644
--- a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
+++ b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
@@ -182,7 +182,7 @@ def prediction_to_mmcif(pred_atom_pos: Union[np.ndarray, paddle.Tensor],
        '-output', mmcif_path,
        ]
   
-  print('Launching subprocess "%s"', ' '.join(cmd))
+  print(f'Launching subprocess "{" " .join(cmd)}"', )
 
   if os.path.exists('maxit.log'):
      os.remove('maxit.log')
@@ -197,7 +197,7 @@ def prediction_to_mmcif(pred_atom_pos: Union[np.ndarray, paddle.Tensor],
 
 
   if retcode:
-    # Logs have a 15k character limit, so log HHblits error line by line.
+    # Logs have a 15k character limit, so log Maxit error line by line.
     print('Maxit failed. Maxit stderr begin:')
     raise RuntimeError(f'Maxit failed\nstdout:\n{stdout.decode("utf-8")}\n\n'
                        f'stderr:\n{stderr[:500_000].decode("utf-8")}\n'
diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
index 0015b4d5..5f51d2be 100644
--- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
+++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
@@ -14,6 +14,10 @@ diff_batch_size: -1  # Corresponds to --diff_batch_size
 use_small_bfd: false # Corresponds to --use_small_bfd
 msa_only: false # Only process msa
 
+nproc_msa: 
+  hhblits: 16
+  jackhmmer: 8
+
 # File paths
 
 input: null  # Corresponds to --input_json, required field
@@ -52,7 +56,7 @@ template:
 
 # Preset configuration
 preset:
-  preset: reduced_dbs  # Corresponds to --preset, choices=['reduced_dbs', 'full_dbs']
+  preset: full_dbs  # Corresponds to --preset, choices=['reduced_dbs', 'full_dbs']
 
 # Other configurations
 other:
diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py
index 3ca52693..a1467d2d 100644
--- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py
+++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py
@@ -15,7 +15,7 @@
 """Functions for building the input features for the HelixFold model."""
 
 import os
-from typing import Any, Mapping, MutableMapping, Optional, Sequence, Union
+from typing import Any, Mapping, MutableMapping, Optional, Protocol, Sequence, Tuple, Union
 from absl import logging
 from helixfold.common import residue_constants
 from helixfold.data import msa_identifiers
@@ -26,12 +26,16 @@
 from helixfold.data.tools import hmmsearch
 from helixfold.data.tools import jackhmmer
 import numpy as np
-from concurrent.futures import ProcessPoolExecutor, as_completed
+from joblib import Parallel, delayed
 # Internal import (7716).
 
 FeatureDict = MutableMapping[str, np.ndarray]
 TemplateSearcher = Union[hhsearch.HHSearch, hmmsearch.Hmmsearch]
 
+class MsaRunner(Protocol):
+    def query(self, input_fasta_path: str) -> Sequence[Mapping[str, Any]]:
+        """Runs the MSA tool on the input fasta file."""
+        ...
 
 def make_sequence_features(
     sequence: str, description: str, num_res: int) -> FeatureDict:
@@ -83,42 +87,41 @@ def make_msa_features(msas: Sequence[parsers.Msa]) -> FeatureDict:
   features['msa_species_identifiers'] = np.array(species_ids, dtype=np.object_)
   return features
 
-
-def run_msa_tool(msa_runner, input_fasta_path: str, msa_out_path: str,
-                 msa_format: str, use_precomputed_msas: bool,
-                 max_sto_sequences: Optional[int] = None
-                 ) -> Mapping[str, Any]:
-  """Runs an MSA tool, checking if output already exists first."""
-  if not use_precomputed_msas or not os.path.exists(msa_out_path):
-    if msa_format == 'sto' and max_sto_sequences is not None:
-      print('pipeline:',input_fasta_path,max_sto_sequences)
-      result = msa_runner.query(input_fasta_path, max_sto_sequences)[0]  # pytype: disable=wrong-arg-count
+# Yinying edited here to change various position args as one tuple for multiprocess tests
+def run_msa_tool(args: Tuple[MsaRunner, str, str, str, bool, int]) -> Mapping[str, Any]:
+    if args == None:
+        return None
+    if (len_args:=len(args))!=6:
+       raise ValueError(f'MsaRunner must have exactly 6 arguments but got {len_args}')
+
+    (msa_runner, input_fasta_path, msa_out_path,
+      msa_format, use_precomputed_msas,
+      max_sto_sequences) = args
+    #print(args)
+    """Runs an MSA tool, checking if output already exists first."""
+    if not use_precomputed_msas or not os.path.exists(msa_out_path):
+        if msa_format == 'sto' and max_sto_sequences > 0:
+            result = msa_runner.query(input_fasta_path, max_sto_sequences)[0]  # pytype: disable=wrong-arg-count
+        else:
+            result = msa_runner.query(input_fasta_path)[0]
+        with open(msa_out_path, 'w') as f:
+            f.write(result[msa_format])
     else:
-      result = msa_runner.query(input_fasta_path)[0]
-    with open(msa_out_path, 'w') as f:
-      f.write(result[msa_format])
-  else:
+        result=read_msa_result(msa_out_path,msa_format,max_sto_sequences)
+    return result
+
+def read_msa_result(msa_out_path,msa_format,max_sto_sequences):
     logging.warning('Reading MSA from file %s', msa_out_path)
-    if msa_format == 'sto' and max_sto_sequences is not None:
-      precomputed_msa = parsers.truncate_stockholm_msa(
-          msa_out_path, max_sto_sequences)
-      result = {'sto': precomputed_msa}
+    if msa_format == 'sto' and max_sto_sequences > 0:
+        precomputed_msa = parsers.truncate_stockholm_msa(
+            msa_out_path, max_sto_sequences)
+        result = {'sto': precomputed_msa}
     else:
-      with open(msa_out_path, 'r') as f:
-        result = {msa_format: f.read()}
-  return result
-
-def run_msa_tool_wrapper(args):
-    """
-    用于包装run_msa_tool函数的帮助程序，以便在使用argparse时可以更轻松地传递参数。
-    
-    Args:
-        args (tuple, list): 一个元组或列表，其中包含要传递给run_msa_tool函数的参数。
-    
-    Returns:
-        int: 返回run_msa_tool函数的返回值。
-    """
-    return run_msa_tool(*args)
+        with open(msa_out_path, 'r') as f:
+            result = {msa_format: f.read()}
+    return result
+
+
 
 
 class DataPipeline:
@@ -138,29 +141,38 @@ def __init__(self,
                use_small_bfd: bool,
                mgnify_max_hits: int = 501,
                uniref_max_hits: int = 10000,
-               use_precomputed_msas: bool = False):
+               use_precomputed_msas: bool = False,
+               nprocs: Mapping[str, int] = {
+                  'hhblits': 16,
+                  'jackhmmer': 8,
+               }):
     """Initializes the data pipeline. Constructs a feature dict for a given FASTA file."""
     self._use_small_bfd = use_small_bfd
+    self.nprocs=nprocs
     self.jackhmmer_uniref90_runner = jackhmmer.Jackhmmer(
         binary_path=jackhmmer_binary_path,
-        database_path=uniref90_database_path)
+        database_path=uniref90_database_path, n_cpu=self.nprocs.get('jackhmmer', 8))
     if use_small_bfd:
-      self.jackhmmer_small_bfd_runner = jackhmmer.Jackhmmer(
+      self.bfd_runner = jackhmmer.Jackhmmer(
           binary_path=jackhmmer_binary_path,
-          database_path=small_bfd_database_path)
+          database_path=small_bfd_database_path, n_cpu=self.nprocs.get('jackhmmer', 8))
     else:
-      self.hhblits_bfd_uniclust_runner = hhblits.HHBlits(
+      self.bfd_runner = hhblits.HHBlits(
           binary_path=hhblits_binary_path,
-          databases=[bfd_database_path, uniclust30_database_path])
+          databases=[bfd_database_path, uniclust30_database_path], n_cpu=self.nprocs.get('hhblits', 8))
     self.jackhmmer_mgnify_runner = jackhmmer.Jackhmmer(
         binary_path=jackhmmer_binary_path,
-        database_path=mgnify_database_path)
+        database_path=mgnify_database_path, n_cpu=self.nprocs.get('jackhmmer', 8))
     self.template_searcher = template_searcher
     self.template_featurizer = template_featurizer
     self.mgnify_max_hits = mgnify_max_hits
     self.uniref_max_hits = uniref_max_hits
     self.use_precomputed_msas = use_precomputed_msas
 
+  def parallel_msa_joblib(self, func, input_args: list):
+    return Parallel(len(input_args))(delayed(func)(args) for args in input_args)
+
+
   def process(self, input_fasta_path: str, msa_output_dir: str) -> FeatureDict:
     """Runs alignment tools on the input sequence and creates features."""
     with open(input_fasta_path) as f:
@@ -202,44 +214,27 @@ def process(self, input_fasta_path: str, msa_output_dir: str) -> FeatureDict:
            input_fasta_path,                                                                                                                    
            os.path.join(msa_output_dir, 'mgnify_hits.sto'),                                                                                  
            'sto',                                                                                                                               
-           self.use_precomputed_msas))
+           self.use_precomputed_msas,
+           self.mgnify_max_hits))
 
-    if self._use_small_bfd:
-      msa_tasks.append((
-          self.jackhmmer_small_bfd_runner,
-          input_fasta_path,
-          os.path.join(msa_output_dir, 'small_bfd_hits.sto'),
-          'sto',
-          self.use_precomputed_msas))
-    else:
-      msa_tasks.append((
-          self.hhblits_bfd_uniclust_runner,
-          input_fasta_path,
-          os.path.join(msa_output_dir, 'bfd_uniclust_hits.a3m'),
-          'a3m',
-          self.use_precomputed_msas))
-
-    msa_results = {}  
-    with ProcessPoolExecutor() as executor:
-      futures = {executor.submit(run_msa_tool_wrapper, msa_task): msa_task for msa_task in msa_tasks}
-  
-      for future in as_completed(futures):
-        task = futures[future]
-        try:
-          result = future.result()
-          if 'uniref90_hits.sto' in task[2]:
-              msa_results['uniref90'] = result
-          elif 'mgnify_hits.sto' in task[2]:
-              msa_results['mgnify'] = result
-          elif 'small_bfd_hits.sto' in task[2]:
-              msa_results['small_bfd'] = result
-          elif 'bfd_uniclust_hits.a3m' in task[2]:
-              msa_results['bfd_uniclust'] = result
-
-        except Exception as exc:
-          print(f'Task {task} generated an exception : {exc}')
-
-    msa_for_templates = msa_results['uniref90']['sto']
+
+    msa_tasks.append((
+        self.bfd_runner,
+        input_fasta_path,
+        os.path.join(msa_output_dir, 'small_bfd_hits.sto' if self._use_small_bfd else 'bfd_uniclust_hits.a3m'),
+        'sto' if self._use_small_bfd else 'a3m',
+        self.use_precomputed_msas,
+        0))
+
+
+    [
+        jackhmmer_uniref90_result,
+        jackhmmer_mgnify_result,
+        bfd_result,
+    ] = self.parallel_msa_joblib(func=run_msa_tool,
+                          input_args=msa_tasks)
+
+    msa_for_templates = jackhmmer_uniref90_result['sto']
     msa_for_templates = parsers.deduplicate_stockholm_msa(msa_for_templates)
     msa_for_templates = parsers.remove_empty_columns_from_stockholm_msa(msa_for_templates)
 
@@ -257,16 +252,16 @@ def process(self, input_fasta_path: str, msa_output_dir: str) -> FeatureDict:
     with open(pdb_hits_out_path, 'w') as f:
       f.write(pdb_templates_result)
 
-    uniref90_msa = parsers.parse_stockholm(msa_results['uniref90']['sto'])
-    mgnify_msa = parsers.parse_stockholm(msa_results['mgnify']['sto'])
+    uniref90_msa = parsers.parse_stockholm(jackhmmer_uniref90_result['sto'])
+    mgnify_msa = parsers.parse_stockholm(jackhmmer_mgnify_result['sto'])
 
     pdb_template_hits = self.template_searcher.get_template_hits(
         output_string=pdb_templates_result, input_sequence=input_sequence)
 
     if self._use_small_bfd:
-        bfd_msa = parsers.parse_stockholm(msa_results['small_bfd']['sto'])
+        bfd_msa = parsers.parse_stockholm(bfd_result['sto'])
     else:
-        raise ValueError("Doesn't support full BFD yet.")
+        bfd_msa = parsers.parse_a3m(bfd_result['a3m'])
 
     templates_result = self.template_featurizer.get_templates(
         query_sequence=input_sequence,
diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py
index a326bbd0..3e028d1d 100644
--- a/apps/protein_folding/helixfold3/helixfold/inference.py
+++ b/apps/protein_folding/helixfold3/helixfold/inference.py
@@ -152,7 +152,9 @@ def get_msa_templates_pipeline(cfg: DictConfig) -> Dict:
         template_searcher=template_searcher,
         template_featurizer=template_featurizer,
         use_small_bfd=cfg.use_small_bfd,
-        use_precomputed_msas=use_precomputed_msas)
+        use_precomputed_msas=use_precomputed_msas,
+        nprocs=cfg.nproc_msa,
+        )
 
     prot_data_pipeline = pipeline_multimer.DataPipeline(
         monomer_data_pipeline=monomer_data_pipeline,
diff --git a/apps/protein_folding/helixfold3/pyproject.toml b/apps/protein_folding/helixfold3/pyproject.toml
index bc988ca9..8c6dc9f7 100644
--- a/apps/protein_folding/helixfold3/pyproject.toml
+++ b/apps/protein_folding/helixfold3/pyproject.toml
@@ -41,6 +41,7 @@ rdkit-pypi = "2022.9.5"
 posebusters = "*" 
 hydra-core= "^1.3.2"
 omegaconf = "^2.3.0"
+joblib = "1.4.2"
 
 
From 8c22e155cbb768e4434b20bc82bc2920391ffdc3 Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sun, 18 Aug 2024 14:02:33 +0800
Subject: [PATCH 19/42] fix: multimer msa parallel

---
 .../helixfold/config/helixfold.yaml           |  4 +--
 .../data/pipeline_multimer_parallel.py        | 33 ++++++++++++++-----
 .../helixfold/data/pipeline_parallel.py       | 17 ++++++++--
 3 files changed, 40 insertions(+), 14 deletions(-)

diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
index 5f51d2be..bc163019 100644
--- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
+++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
@@ -15,8 +15,8 @@ use_small_bfd: false # Corresponds to --use_small_bfd
 msa_only: false # Only process msa
 
 nproc_msa: 
-  hhblits: 16
-  jackhmmer: 8
+  hhblits: 16 # Number of processors used by hhblits
+  jackhmmer: 8 # Number of processors used by jackhmmer
 
 # File paths
 
diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_multimer_parallel.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_multimer_parallel.py
index 9d595a07..251503aa 100644
--- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_multimer_parallel.py
+++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_multimer_parallel.py
@@ -14,7 +14,8 @@
 from helixfold.data import feature_processing
 from helixfold.data import msa_pairing
 from helixfold.data import parsers
-from helixfold.data import pipeline
+#from helixfold.data import pipeline
+from helixfold.data import pipeline_parallel as pipeline
 from helixfold.data.tools import jackhmmer
 import numpy as np
 import multiprocessing
@@ -197,24 +198,38 @@ def _process_single_chain(
     with temp_fasta_file(chain_fasta_str) as chain_fasta_path:
       logging.info('Running monomer pipeline on chain %s: %s',
                    chain_id, description)
+
+      # We only construct the pairing features if there are 2 or more unique
+      # sequences.
+      self.jackhmmer_uniprot_args=(
+          self._uniprot_msa_runner,
+          str(chain_fasta_path),
+          os.path.join(chain_msa_output_dir, 'uniprot_hits.sto'),
+          'sto',
+          self.use_precomputed_msas,
+          0
+      )
+      
       chain_features = self._monomer_data_pipeline.process(
           input_fasta_path=chain_fasta_path,
-          msa_output_dir=chain_msa_output_dir)
+          msa_output_dir=chain_msa_output_dir,
+          other_args=self.jackhmmer_uniprot_args if not is_homomer_or_monomer else None)
 
       # We only construct the pairing features if there are 2 or more unique
       # sequences.
       if not is_homomer_or_monomer:
-        all_seq_msa_features = self._all_seq_msa_features(chain_fasta_path,
-                                                          chain_msa_output_dir)
+        all_seq_msa_features = self._all_seq_msa_features(chain_msa_output_dir)
         chain_features.update(all_seq_msa_features)
     return chain_features
 
-  def _all_seq_msa_features(self, input_fasta_path, msa_output_dir):
+  def _all_seq_msa_features(self, msa_output_dir):
     """Get MSA features for unclustered uniprot, for pairing."""
-    out_path = os.path.join(msa_output_dir, 'uniprot_hits.sto')
-    result = pipeline.run_msa_tool(
-        self._uniprot_msa_runner, input_fasta_path, out_path, 'sto',
-        self.use_precomputed_msas)
+    # edited by yinying to adapt to the multiprocess version of run_msa_tool function
+    result = pipeline.read_msa_result(
+        msa_out_path=os.path.join(msa_output_dir, 'uniprot_hits.sto'),
+        msa_format='sto',
+        max_sto_sequences=0
+    )
     msa = parsers.parse_stockholm(result['sto'])
     msa = msa.truncate(max_seqs=self._max_uniprot_hits)
     all_seq_features = pipeline.make_msa_features([msa])
diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py
index a1467d2d..53bb0445 100644
--- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py
+++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py
@@ -33,10 +33,17 @@
 TemplateSearcher = Union[hhsearch.HHSearch, hmmsearch.Hmmsearch]
 
 class MsaRunner(Protocol):
+    n_cpu: int
+    
     def query(self, input_fasta_path: str) -> Sequence[Mapping[str, Any]]:
         """Runs the MSA tool on the input fasta file."""
         ...
 
+def check_used_ncpus(used: list[int]):
+   ncpus_sum=sum(used)
+   if ncpus_sum >os.cpu_count():
+      logging.warning(f"The number of used CPUs({ncpus_sum}) is larger than the number of available CPUs({os.cpu_count()}).")
+
 def make_sequence_features(
     sequence: str, description: str, num_res: int) -> FeatureDict:
   """Constructs a feature dict of sequence features."""
@@ -170,10 +177,10 @@ def __init__(self,
     self.use_precomputed_msas = use_precomputed_msas
 
   def parallel_msa_joblib(self, func, input_args: list):
-    return Parallel(len(input_args))(delayed(func)(args) for args in input_args)
+    return Parallel(len(input_args),verbose=100)(delayed(func)(args) for args in input_args)
 
 
-  def process(self, input_fasta_path: str, msa_output_dir: str) -> FeatureDict:
+  def process(self, input_fasta_path: str, msa_output_dir: str,other_args: Optional[tuple] = None) -> FeatureDict:
     """Runs alignment tools on the input sequence and creates features."""
     with open(input_fasta_path) as f:
       input_fasta_str = f.read()
@@ -186,7 +193,7 @@ def process(self, input_fasta_path: str, msa_output_dir: str) -> FeatureDict:
     num_res = len(input_sequence)
 
 
-    msa_tasks = []
+    msa_tasks: list[Tuple[MsaRunner, str, str, str, bool, int]] = []
     """uniref90_out_path = os.path.join(msa_output_dir, 'uniref90_hits.sto')
     jackhmmer_uniref90_result = run_msa_tool(
         msa_runner=self.jackhmmer_uniref90_runner,
@@ -225,12 +232,16 @@ def process(self, input_fasta_path: str, msa_output_dir: str) -> FeatureDict:
         'sto' if self._use_small_bfd else 'a3m',
         self.use_precomputed_msas,
         0))
+    
+    msa_tasks.append(other_args)
 
+    check_used_ncpus(used=[mask[0].n_cpu for mask in msa_tasks if hasattr(mask[0], 'n_cpu')])
 
     [
         jackhmmer_uniref90_result,
         jackhmmer_mgnify_result,
         bfd_result,
+        
     ] = self.parallel_msa_joblib(func=run_msa_tool,
                           input_args=msa_tasks)
 

From 8ce13ca2262983f93af0f89d7e3d2bd39d50dbb5 Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sun, 18 Aug 2024 14:07:54 +0800
Subject: [PATCH 20/42] fix: multimer msa parallel

---
 .../helixfold3/helixfold/data/pipeline_parallel.py               | 1 +
 1 file changed, 1 insertion(+)

diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py
index 53bb0445..ade48cb5 100644
--- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py
+++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py
@@ -241,6 +241,7 @@ def process(self, input_fasta_path: str, msa_output_dir: str,other_args: Optiona
         jackhmmer_uniref90_result,
         jackhmmer_mgnify_result,
         bfd_result,
+        other_result
         
     ] = self.parallel_msa_joblib(func=run_msa_tool,
                           input_args=msa_tasks)

From 8548570c8470ca53b6cc438fef3ee0fd496bbd0f Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sun, 18 Aug 2024 14:25:56 +0800
Subject: [PATCH 21/42] drop: buildin logger

---
 .../infer_scripts/feature_processing_aa.py    | 12 +++---
 .../helixfold3/helixfold/inference.py         | 22 +++++------
 .../helixfold3/helixfold/utils/misc.py        | 39 -------------------
 .../helixfold3/helixfold/utils/model.py       |  4 +-
 4 files changed, 19 insertions(+), 58 deletions(-)
 delete mode 100644 apps/protein_folding/helixfold3/helixfold/utils/misc.py

diff --git a/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py
index b89f43f4..3db7640c 100644
--- a/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py
+++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py
@@ -2,9 +2,10 @@
 import collections
 import copy
 import os
+from pathlib import Path
 import time, gzip, pickle
 import numpy as np
-import logging
+from absl import logging
 from helixfold.common import residue_constants
 from helixfold.data import parsers
 from helixfold.data import pipeline_multimer
@@ -14,7 +15,6 @@
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from .preprocess import digit2alphabet
 
-logger = logging.getLogger(__file__)
 
 POLYMER_STANDARD_RESI_ATOMS = residue_constants.residue_atoms
 STRING_FEATURES = ['all_chain_ids', 'all_ccd_ids','all_atom_ids', 
@@ -356,7 +356,7 @@ def process_chain_msa(args):
     data_pipeline, chain_id, seq, desc, \
     msa_output_dir, features_pkl = args
     if features_pkl.exists():
-        logger.info('Use cached features.pkl')
+        logging.info('Use cached features.pkl')
         with open(features_pkl, 'rb') as f:
             raw_features = pickle.load(f)
     else:
@@ -450,7 +450,7 @@ def process_input_json(all_entitys, ccd_preprocessed_path,
 
       ## 2. multiprocessing for protein/rna MSA/Template search.
       seqs_to_msa_features = {}
-      logger.info('[Multiprocess] starting MSA/Template search...')
+      logging.info('[Multiprocess] starting MSA/Template search...')
       t0 = time.time()
       with ProcessPoolExecutor() as executor:
           futures = [executor.submit(process_chain_msa, task) for task in tasks]
@@ -461,8 +461,8 @@ def process_input_json(all_entitys, ccd_preprocessed_path,
                   seqs_to_msa_features[seqs] = raw_features
               except Exception as exc:
                   import traceback; traceback.print_exc()
-                  logger.error(f'Task generated an exception : {exc}')
-      logger.info(f'[Multiprocess] All msa/template use: {time.time() - t0}')
+                  logging.error(f'Task generated an exception : {exc}')
+      logging.info(f'[Multiprocess] All msa/template use: {time.time() - t0}')
 
       ## 3. add msa_templ_feats to all_chain_features.
       for type_chain_id in all_chain_features.keys():
diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py
index 3e028d1d..7734c964 100644
--- a/apps/protein_folding/helixfold3/helixfold/inference.py
+++ b/apps/protein_folding/helixfold3/helixfold/inference.py
@@ -26,6 +26,8 @@
 import numpy as np
 import shutil
 
+from absl import logging
+
 from omegaconf import DictConfig
 import hydra
 
@@ -40,7 +42,6 @@
 from helixfold.data.tools import hmmsearch
 from helixfold.data import templates
 from helixfold.utils.utils import get_custom_amp_list
-from helixfold.utils.misc import set_logging_level
 from typing import Dict
 from helixfold.infer_scripts import feature_processing_aa, preprocess
 from helixfold.infer_scripts.tools import mmcif_writer
@@ -68,7 +69,6 @@
 
 RETURN_KEYS = ['diffusion_module', 'confidence_head']
 
-logger = logging.getLogger(__file__)
 
 MAX_TEMPLATE_HITS = 4
 
@@ -95,7 +95,7 @@ def preprocess_json_entity(json_path, out_dir):
     if all_entitys is None:
         raise ValueError("The json file does not contain any valid entity.")
     else:
-        logger.info("The json file contains %d valid entity.", len(all_entitys))
+        logging.info("The json file contains %d valid entity.", len(all_entitys))
     
     return all_entitys
 
@@ -187,7 +187,7 @@ def ranking_all_predictions(output_dirs):
     ranked_map = dict(sorted(ranking_score_path_map.items(), key=lambda x: x[1], reverse=True))
     rank_id = 1
     for outpath, rank_score in ranked_map.items():
-        logger.debug("[ranking_all_predictions] Ranking score of %s: %.5f", outpath, rank_score)
+        logging.debug("[ranking_all_predictions] Ranking score of %s: %.5f", outpath, rank_score)
         basename_prefix = os.path.basename(outpath).split('-pred-')[0]
         target_path = os.path.join(os.path.dirname(outpath), f'{basename_prefix}-rank{rank_id}')
         if os.path.exists(target_path) and os.path.isdir(target_path):
@@ -216,7 +216,7 @@ def _forward_with_precision(batch):
             raise ValueError("Please choose precision from bf16 and fp32! ")
         
     res = _forward_with_precision(batch)
-    logger.info(f"Inference Succeeds...\n")
+    logging.info(f"Inference Succeeds...\n")
     return res
 
 
@@ -452,7 +452,7 @@ def split_prediction(pred, rank):
 
 @hydra.main(version_base=None, config_path=os.path.join(script_path,'config',),config_name='helixfold')
 def main(cfg: DictConfig):
-    set_logging_level(cfg.logging_level)
+    logging.set_verbosity(cfg.logging_level)
 
     """main function"""
     new_einsum = os.getenv("FLAGS_new_einsum", True)
@@ -477,7 +477,7 @@ def main(cfg: DictConfig):
     if seed is None:
         seed = np.random.randint(10000000)
     else:
-        logger.warning('Seed is only used for reproduction')
+        logging.warning('Seed is only used for reproduction')
     init_seed(seed)
 
     use_small_bfd = cfg.preset.preset == 'reduced_dbs'
@@ -488,7 +488,7 @@ def main(cfg: DictConfig):
         assert cfg.db.bfd is not None
         assert cfg.db.uniclust30 is not None
 
-    logger.info('Getting MSA/Template Pipelines...')
+    logging.info('Getting MSA/Template Pipelines...')
     msa_templ_data_pipeline_dict = get_msa_templates_pipeline(cfg=cfg)
         
     ### Create model
@@ -541,12 +541,12 @@ def main(cfg: DictConfig):
     if cfg.diff_batch_size > 0:
         model_config.model.heads.diffusion_module.test_diff_batch_size = cfg.diff_batch_size
     diff_batch_size = model_config.model.heads.diffusion_module.test_diff_batch_size 
-    logger.info(f'Inference {infer_times} Times...')
-    logger.info(f"Diffusion batch size {diff_batch_size}...\n")
+    logging.info(f'Inference {infer_times} Times...')
+    logging.info(f"Diffusion batch size {diff_batch_size}...\n")
     all_pred_path = []
     for infer_id in range(infer_times):
         
-        logger.info(f'Start {infer_id}-th inference...\n')
+        logging.info(f'Start {infer_id}-th inference...\n')
         prediction = eval(cfg, model, feature_dict)
         
         # save result
diff --git a/apps/protein_folding/helixfold3/helixfold/utils/misc.py b/apps/protein_folding/helixfold3/helixfold/utils/misc.py
deleted file mode 100644
index b8faa065..00000000
--- a/apps/protein_folding/helixfold3/helixfold/utils/misc.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# copyright (c) 2024 PaddleHelix Authors. All Rights Reserve.
-#
-# Licensed under Creative Commons Attribution-NonCommercial-ShareAlike 4.0
-# International License (the "License");  you may not use this file  except
-# in compliance with the License. You may obtain a copy of the License at
-#
-#    http://creativecommons.org/licenses/by-nc-sa/4.0/
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-misc utils
-"""
-
-import logging
-
-def set_logging_level(level):
-    level_dict = {
-        "NOTSET": logging.NOTSET,
-        "DEBUG": logging.DEBUG,
-        "INFO": logging.INFO,
-        "WARNING": logging.WARNING,
-        "ERROR": logging.ERROR,
-        "CRITICAL": logging.CRITICAL
-    }
-    logging.basicConfig(
-        format='%(asctime)s %(levelname)s %(message)s',
-        level=level_dict[level],
-        datefmt='%Y-%m-%d %H:%M:%S')
-    for h in logging.root.handlers: 
-        h.setFormatter(
-            logging.Formatter(
-                '%(asctime)s %(levelname)s %(message)s', 
-                datefmt='%Y-%m-%d %H:%M:%S'
-            ))
-    logging.root.setLevel(level_dict[level])
diff --git a/apps/protein_folding/helixfold3/helixfold/utils/model.py b/apps/protein_folding/helixfold3/helixfold/utils/model.py
index 67b36128..885510bd 100644
--- a/apps/protein_folding/helixfold3/helixfold/utils/model.py
+++ b/apps/protein_folding/helixfold3/helixfold/utils/model.py
@@ -22,7 +22,7 @@
 
 from helixfold.model import modules_all_atom
 from helixfold.model import utils
-logger = logging.getLogger(__name__)
+from absl import logging
 
 class RunModel(nn.Layer):
     """
@@ -69,7 +69,7 @@ def init_params(self, params_path: str):
                 utils.pd_params_merge_qkvw(pd_params)
 
         elif params_path.endswith('.pd') or params_path.endswith('.pdparams'):
-            logger.info('Load as Paddle model')
+            logging.info('Load as Paddle model')
             pd_params = paddle.load(params_path)
 
         else:

From d2feac530ce12b066f67622ad489e8059dad6fdf Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Mon, 19 Aug 2024 11:08:11 +0800
Subject: [PATCH 22/42] fix: nested parallel runs for protein chains

---
 .../data/demo_7u7w_protein_nucleic.json       | 19 +++++++++
 .../data/demo_7u7w_protein_nucleic_sm.json    | 24 ++++++++++++
 .../helixfold/data/pipeline_conf_bonds.py     |  6 +--
 .../helixfold/data/pipeline_token_feature.py  | 39 +++++++++++++------
 .../infer_scripts/feature_processing_aa.py    | 39 ++++++++-----------
 5 files changed, 91 insertions(+), 36 deletions(-)
 create mode 100644 apps/protein_folding/helixfold3/data/demo_7u7w_protein_nucleic.json
 create mode 100644 apps/protein_folding/helixfold3/data/demo_7u7w_protein_nucleic_sm.json

diff --git a/apps/protein_folding/helixfold3/data/demo_7u7w_protein_nucleic.json b/apps/protein_folding/helixfold3/data/demo_7u7w_protein_nucleic.json
new file mode 100644
index 00000000..b846bea4
--- /dev/null
+++ b/apps/protein_folding/helixfold3/data/demo_7u7w_protein_nucleic.json
@@ -0,0 +1,19 @@
+{
+    "entities": [
+        {
+            "type": "protein",
+            "sequence": "GPHMATGQDRVVALVDMDCFFVQVEQRQNPHLRNKPCAVVQYKSWKGGGIIAVSYEARAFGVTRSMWADDAKKLCPDLLLAQVRESRGKANLTKYREASVEVMEIMSRFAVIERASIDEAYVDLTSAVQERLQKLQGQPISADLLPSTYIEGLPQGPTTAEETVQKEGMRKQGLFQWLDSLQIDNLTSPDLQLTVGAVIVEEMRAAIERETGFQCSAGISHNKVLAKLACGLNKPNRQTLVSHGSVPQLFSQMPIRKIRSLGGKLGASVIEILGIEYMGELTQFTESQLQSHFGEKNGSWLYAMCRGIEHDPVKPRQLPKTIGCSKNFPGKTALATREQVQWWLLQLAQELEERLTKDRNDNDRVATQLVVSIRVQGDKRLSSLRRCCALTRYDAHKMSHDAFTVIKNCNTSGIQTEWSPPLTMLFLCATKFSAS",
+            "count": 1
+        },
+        {
+            "type": "dna",
+            "sequence": "CATTATGACGCT",
+            "count": 1
+        },
+        {
+            "type": "dna",
+            "sequence": "AGCGTCAT",
+            "count": 1
+        }
+    ]
+}
\ No newline at end of file
diff --git a/apps/protein_folding/helixfold3/data/demo_7u7w_protein_nucleic_sm.json b/apps/protein_folding/helixfold3/data/demo_7u7w_protein_nucleic_sm.json
new file mode 100644
index 00000000..77b4dbd1
--- /dev/null
+++ b/apps/protein_folding/helixfold3/data/demo_7u7w_protein_nucleic_sm.json
@@ -0,0 +1,24 @@
+{
+    "entities": [
+        {
+            "type": "protein",
+            "sequence": "GPHMATGQDRVVALVDMDCFFVQVEQRQNPHLRNKPCAVVQYKSWKGGGIIAVSYEARAFGVTRSMWADDAKKLCPDLLLAQVRESRGKANLTKYREASVEVMEIMSRFAVIERASIDEAYVDLTSAVQERLQKLQGQPISADLLPSTYIEGLPQGPTTAEETVQKEGMRKQGLFQWLDSLQIDNLTSPDLQLTVGAVIVEEMRAAIERETGFQCSAGISHNKVLAKLACGLNKPNRQTLVSHGSVPQLFSQMPIRKIRSLGGKLGASVIEILGIEYMGELTQFTESQLQSHFGEKNGSWLYAMCRGIEHDPVKPRQLPKTIGCSKNFPGKTALATREQVQWWLLQLAQELEERLTKDRNDNDRVATQLVVSIRVQGDKRLSSLRRCCALTRYDAHKMSHDAFTVIKNCNTSGIQTEWSPPLTMLFLCATKFSAS",
+            "count": 1
+        },
+        {
+            "type": "dna",
+            "sequence": "CATTATGACGCT",
+            "count": 1
+        },
+        {
+            "type": "dna",
+            "sequence": "AGCGTCAT",
+            "count": 1
+        },
+        {
+            "type": "ligand",
+            "ccd": "XG4",
+            "count": 1
+        }
+    ]
+}
\ No newline at end of file
diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
index dee10bb0..da80d9af 100644
--- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
+++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
@@ -107,9 +107,9 @@ def make_ccd_conf_features(all_chain_info, ccd_preprocessed_dict,
     features[k] = np.concatenate(v, axis=0)
   features['ref_atom_count'] = np.bincount(features['ref_token2atom_idx'])
 
-  assert np.max(features['ref_element']) < 128
-  assert np.max(features['ref_atom_name_chars']) < 64
-  assert len(set([len(v) for k, v in features.items() if k != 'ref_atom_count'])) == 1 ## To check same Atom-level features.
+  assert np.max(features['ref_element']) < 128 # WTF?
+  assert np.max(features['ref_atom_name_chars']) < 64 # WTF?
+  assert len(set([len(v) for k, v in features.items() if k != 'ref_atom_count'])) == 1 ## To check same Atom-level features. # WTF?
   return features
 
 
diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_token_feature.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_token_feature.py
index 7bbb1805..6c16a2c4 100644
--- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_token_feature.py
+++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_token_feature.py
@@ -3,16 +3,17 @@
 import collections
 import os
 import time
-from typing import MutableMapping, Optional, List
+from typing import Any, MutableMapping, Optional, List
 from absl import logging
 from helixfold.common import residue_constants
 from helixfold.data import parsers
+from helixfold.data.tools import utils
 import numpy as np
 import json
 import gzip
 import pickle
 from rdkit import Chem
- 
+
 FeatureDict = MutableMapping[str, np.ndarray]
 ELEMENT_MAPPING = Chem.GetPeriodicTable()
  
@@ -56,6 +57,15 @@ def flatten_is_protein_features(is_protein_feats: np.ndarray) -> FeatureDict:
  
   return res
  
+
+def dump_all_ccd_keys(ccd_data: dict[str, Any]):
+  ccd_keys_file='all_ccd_keys.txt'
+  if not os.path.isfile(ccd_keys_file):
+      open(ccd_keys_file, 'w').write('\n'.join(ccd_data.keys()))
+  logging.warning(f'All ccd keys are dumped to {ccd_keys_file}')
+  return ccd_keys_file
+  
+   
 def make_sequence_features(
     all_chain_info, ccd_preprocessed_dict, 
     extra_feats: Optional[dict]=None) -> FeatureDict:
@@ -106,13 +116,18 @@ def make_sequence_features(
     sym_id = chainid_to_sym_id[_alphabet_chain_id]
     for residue_id, ccd_id in enumerate(ccd_seq):
       if ccd_id not in ccd_preprocessed_dict:
-        assert not extra_feats is None and ccd_id in extra_feats,\
-                  f'<{ccd_id}> not in ccd_preprocessed_dict, But got extra_feats is None'
+        if extra_feats is None:
+          ccd_kf=dump_all_ccd_keys(ccd_preprocessed_dict)
+          raise ValueError(f'<{ccd_id}> not in ccd_preprocessed_dict, But got extra_feats is None. See all keys in {ccd_kf}')
+        if ccd_id not in extra_feats:
+          ccd_kf=dump_all_ccd_keys(ccd_preprocessed_dict)
+          raise ValueError(f'<{ccd_id}> not in ccd_preprocessed_dict or extra_feats.  See all keys in {ccd_kf}')
         _ccd_feats = extra_feats[ccd_id]
       else:
         _ccd_feats = ccd_preprocessed_dict[ccd_id]
       num_atoms = len(_ccd_feats['position'])
-      assert num_atoms > 0, f'TODO filter - Got CCD <{ccd_id}>: 0 atom nums.'
+      if num_atoms == 0:
+         raise NotImplementedError(f'TODO filter - Got CCD <{ccd_id}>: 0 atom nums.')
       
       if ccd_id not in residue_constants.STANDARD_LIST: 
           features['asym_id'].append(np.array([chain_num_id] * num_atoms, dtype=np.int32))
@@ -198,12 +213,14 @@ def process(self,
 
     if ccd_preprocessed_dict is None:
       ccd_preprocessed_dict = {}
-      st_1 = time.time()
-      if 'pkl.gz' in self.ccd_preprocessed_path:
-          with gzip.open(self.ccd_preprocessed_path, "rb") as fp:
-              ccd_preprocessed_dict = pickle.load(fp)
-      logging.info(f'load ccd dataset done. use {time.time()-st_1}s')
- 
+
+    if not 'pkl.gz' in self.ccd_preprocessed_path:
+      raise ValueError(f'Invalid ccd_preprocessed_path: {self.ccd_preprocessed_path}')
+
+    with utils.timing('Loading CCD dataset'):
+      with gzip.open(self.ccd_preprocessed_path, "rb") as fp:
+          ccd_preprocessed_dict = pickle.load(fp)
+
     if select_mmcif_chainID is not None:
       select_mmcif_chainID = set(select_mmcif_chainID)
  
diff --git a/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py
index 3db7640c..cbec3867 100644
--- a/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py
+++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py
@@ -4,15 +4,19 @@
 import os
 from pathlib import Path
 import time, gzip, pickle
+from typing import Optional, Tuple
 import numpy as np
 from absl import logging
+
+
 from helixfold.common import residue_constants
 from helixfold.data import parsers
-from helixfold.data import pipeline_multimer
+from helixfold.data.tools import utils
+from helixfold.data import pipeline_multimer, pipeline_multimer_parallel
 from helixfold.data import pipeline_rna_multimer
 from helixfold.data import pipeline_conf_bonds, pipeline_token_feature, pipeline_hybrid
 from helixfold.data import label_utils
-from concurrent.futures import ProcessPoolExecutor, as_completed
+
 from .preprocess import digit2alphabet
 
 
@@ -330,7 +334,7 @@ def add_assembly_features(all_chain_features, ccd_preprocessed_dict, no_msa_temp
           "label": label,}
 
 
-def process_chain_msa(args):
+def process_chain_msa(args: tuple[pipeline_multimer_parallel.DataPipeline, str, Optional[str],Optional[str], os.PathLike,os.PathLike ]) -> Tuple[str,dict, str, str]:
     """
     处理链，如果缓存了特征文件，则直接使用缓存的特征文件，否则生成新的特征文件。
     
@@ -360,12 +364,12 @@ def process_chain_msa(args):
         with open(features_pkl, 'rb') as f:
             raw_features = pickle.load(f)
     else:
-        t0 = time.time()
-        raw_features = data_pipeline._process_single_chain(
-            chain_id, sequence=seq, description=desc,
-            msa_output_dir=msa_output_dir,
-            is_homomer_or_monomer=False)
-        print(f'[MSA/Template] {desc}; seq length: {len(seq)}; use: {time.time() - t0}')
+        with utils.timing(f'[MSA/Template]({desc}) with seq length: {len(seq)}'):
+          raw_features = data_pipeline._process_single_chain(
+              chain_id, sequence=seq, description=desc,
+              msa_output_dir=msa_output_dir,
+              is_homomer_or_monomer=False)
+       
 
         with open(features_pkl, 'wb') as f:
             pickle.dump(raw_features, f, protocol=4)
@@ -450,19 +454,10 @@ def process_input_json(all_entitys, ccd_preprocessed_path,
 
       ## 2. multiprocessing for protein/rna MSA/Template search.
       seqs_to_msa_features = {}
-      logging.info('[Multiprocess] starting MSA/Template search...')
-      t0 = time.time()
-      with ProcessPoolExecutor() as executor:
-          futures = [executor.submit(process_chain_msa, task) for task in tasks]
-
-          for future in as_completed(futures):
-              try:
-                  _, raw_features, type_chain_id, seqs = future.result()
-                  seqs_to_msa_features[seqs] = raw_features
-              except Exception as exc:
-                  import traceback; traceback.print_exc()
-                  logging.error(f'Task generated an exception : {exc}')
-      logging.info(f'[Multiprocess] All msa/template use: {time.time() - t0}')
+      with utils.timing('MSA/Template search'):
+        for task in tasks:
+          _, raw_features, type_chain_id, seqs=process_chain_msa(task)
+          seqs_to_msa_features[seqs] = raw_features
 
       ## 3. add msa_templ_feats to all_chain_features.
       for type_chain_id in all_chain_features.keys():

From b3153de46b53518259e0335a7182aa121916c144 Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Mon, 19 Aug 2024 16:37:10 +0800
Subject: [PATCH 23/42] refactor: deduplicate code

---
 .../helixfold/data/pipeline_conf_bonds.py     | 39 +++++++++++++++++--
 .../helixfold/data/pipeline_token_feature.py  | 14 ++-----
 .../infer_scripts/feature_processing_aa.py    | 23 +++--------
 3 files changed, 45 insertions(+), 31 deletions(-)

diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
index da80d9af..0da4a980 100644
--- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
+++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
@@ -1,10 +1,19 @@
 """Functions for building the input features (reference ccd features) for the HelixFold model."""
 
 import collections
-from typing import Optional
+import gzip
+import os
+import pickle
+from typing import Any, Optional
+
+from absl import logging
+from immutabledict import immutabledict
 from helixfold.common import residue_constants
 import numpy as np
 
+from helixfold.data.tools import utils
+
+
 ALLOWED_LIGAND_BONDS_TYPE = {
     "SING": 1,
     "DOUB": 2,
@@ -13,6 +22,25 @@
     "AROM": 12,
 }
 
+def load_ccd_dict(ccd_preprocessed_path: str) -> immutabledict[str, Any]:
+    if not os.path.exists(ccd_preprocessed_path):
+      raise FileNotFoundError(f'[CCD] ccd_preprocessed_path: {ccd_preprocessed_path} not exist.')
+    
+    if not ccd_preprocessed_path.endswith('.pkl.gz') and not ccd_preprocessed_path.endswith('.pkl'):
+        raise ValueError(f'[CCD] ccd_preprocessed_path: {ccd_preprocessed_path} not endswith .pkl.gz and .pkl')
+  
+    with utils.timing(f'Loading CCD dataset from {ccd_preprocessed_path}'):
+      if ccd_preprocessed_path.endswith('.pkl.gz'):
+          with gzip.open(ccd_preprocessed_path, "rb") as fp:
+              ccd_preprocessed_dict = immutabledict(pickle.load(fp))
+      else:
+          with open(ccd_preprocessed_path, "rb") as fp:
+              ccd_preprocessed_dict = immutabledict(pickle.load(fp))
+    
+    logging.info(f'CCD dataset contains {len(ccd_preprocessed_dict)} entries.')
+    
+    return ccd_preprocessed_dict
+
 def element_map_with_x(atom_symbol):
   # ## one-hot max shape == 128
   return residue_constants.ATOM_ELEMENT.get(atom_symbol, 127)
@@ -107,8 +135,13 @@ def make_ccd_conf_features(all_chain_info, ccd_preprocessed_dict,
     features[k] = np.concatenate(v, axis=0)
   features['ref_atom_count'] = np.bincount(features['ref_token2atom_idx'])
 
-  assert np.max(features['ref_element']) < 128 # WTF?
-  assert np.max(features['ref_atom_name_chars']) < 64 # WTF?
+  if (_ref_element:=np.max(features['ref_element'])) >= 128:
+     raise ValueError(f'ref_element= {_ref_element}, which is larger then 128.\n{features["ref_element"]}\n{"-"*79}')
+
+  
+  if (_ref_atom_name_chars:=np.max(features['ref_atom_name_chars'])) >= 64:
+     raise ValueError(f'ref_atom_name_chars= {_ref_atom_name_chars}, which is larger then 64.\n{features["ref_atom_name_chars"]}\n{"-"*79}')
+
   assert len(set([len(v) for k, v in features.items() if k != 'ref_atom_count'])) == 1 ## To check same Atom-level features. # WTF?
   return features
 
diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_token_feature.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_token_feature.py
index 6c16a2c4..92ea0e1e 100644
--- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_token_feature.py
+++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_token_feature.py
@@ -7,11 +7,10 @@
 from absl import logging
 from helixfold.common import residue_constants
 from helixfold.data import parsers
-from helixfold.data.tools import utils
+from helixfold.data.pipeline_conf_bonds import load_ccd_dict
 import numpy as np
 import json
-import gzip
-import pickle
+
 from rdkit import Chem
 
 FeatureDict = MutableMapping[str, np.ndarray]
@@ -212,14 +211,7 @@ def process(self,
       assembly_dict = unit_dict
 
     if ccd_preprocessed_dict is None:
-      ccd_preprocessed_dict = {}
-
-    if not 'pkl.gz' in self.ccd_preprocessed_path:
-      raise ValueError(f'Invalid ccd_preprocessed_path: {self.ccd_preprocessed_path}')
-
-    with utils.timing('Loading CCD dataset'):
-      with gzip.open(self.ccd_preprocessed_path, "rb") as fp:
-          ccd_preprocessed_dict = pickle.load(fp)
+      ccd_preprocessed_dict=load_ccd_dict(self.ccd_preprocessed_path)
 
     if select_mmcif_chainID is not None:
       select_mmcif_chainID = set(select_mmcif_chainID)
diff --git a/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py
index cbec3867..fd875159 100644
--- a/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py
+++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py
@@ -3,8 +3,9 @@
 import copy
 import os
 from pathlib import Path
-import time, gzip, pickle
-from typing import Optional, Tuple
+import pickle
+from typing import Optional, Tuple, Any
+
 import numpy as np
 from absl import logging
 
@@ -17,6 +18,8 @@
 from helixfold.data import pipeline_conf_bonds, pipeline_token_feature, pipeline_hybrid
 from helixfold.data import label_utils
 
+from helixfold.data.tools import utils
+
 from .preprocess import digit2alphabet
 
 
@@ -24,20 +27,6 @@
 STRING_FEATURES = ['all_chain_ids', 'all_ccd_ids','all_atom_ids', 
                   'release_date','label_ccd_ids','label_atom_ids']
 
-def load_ccd_dict(ccd_preprocessed_path):
-    assert os.path.exists(ccd_preprocessed_path),\
-              (f'[CCD] ccd_preprocessed_path: {ccd_preprocessed_path} not exist.')
-    st_1 = time.time()
-    if 'pkl.gz' in ccd_preprocessed_path:
-        with gzip.open(ccd_preprocessed_path, "rb") as fp:
-            ccd_preprocessed_dict = pickle.load(fp)
-    elif '.pkl' in ccd_preprocessed_path:
-        with open(ccd_preprocessed_path, "rb") as fp:
-            ccd_preprocessed_dict = pickle.load(fp)
-    print(f'[CCD] load ccd dataset done. use {time.time()-st_1}s;'\
-                    f'Has length of {len(ccd_preprocessed_dict)}')
-    
-    return ccd_preprocessed_dict
 
 
 def crop_msa(feat, max_msa_depth=16384):
@@ -385,7 +374,7 @@ def process_input_json(all_entitys, ccd_preprocessed_path,
                           no_msa_templ_feats=False):
 
     ## load ccd dict.
-    ccd_preprocessed_dict = load_ccd_dict(ccd_preprocessed_path)
+    ccd_preprocessed_dict = pipeline_conf_bonds.load_ccd_dict(ccd_preprocessed_path)
     all_chain_features = {}
     sequence_features = {} 
     num_chains = 0

From cd68f004e4a7345363e15f073f64974638ea217f Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Mon, 19 Aug 2024 19:11:03 +0800
Subject: [PATCH 24/42] fix: hetatm input raise and dialognoses

---
 .../helixfold/data/pipeline_conf_bonds.py         | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
index 0da4a980..0973f48c 100644
--- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
+++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
@@ -107,6 +107,13 @@ def make_ccd_conf_features(all_chain_info, ccd_preprocessed_dict,
       features['ref_element'].append(np.array([element_map_with_x(t[0].upper() + t[1:].lower())
                                               for t in _ccd_feats['atom_symbol']], dtype=np.int32))
       features['ref_charge'].append(np.array(_ccd_feats['charge'], dtype=np.int32))
+
+      # atom checks
+      for atom_id in _ccd_feats['atom_ids']:
+        converted_atom_id_name=convert_atom_id_name(atom_id)
+        if max(converted_atom_id_name)>= 64:
+          raise ValueError(f'>>> Problematic atom in ligand ({residue_id=}, {ccd_id=}, {chain_id=}) {atom_id=}, {converted_atom_id_name=}')
+      
       features['ref_atom_name_chars'].append(
                               np.array([convert_atom_id_name(atom_id) for atom_id in _ccd_feats['atom_ids']]
                                                                 , dtype=np.int32))
@@ -135,12 +142,8 @@ def make_ccd_conf_features(all_chain_info, ccd_preprocessed_dict,
     features[k] = np.concatenate(v, axis=0)
   features['ref_atom_count'] = np.bincount(features['ref_token2atom_idx'])
 
-  if (_ref_element:=np.max(features['ref_element'])) >= 128:
-     raise ValueError(f'ref_element= {_ref_element}, which is larger then 128.\n{features["ref_element"]}\n{"-"*79}')
-
-  
-  if (_ref_atom_name_chars:=np.max(features['ref_atom_name_chars'])) >= 64:
-     raise ValueError(f'ref_atom_name_chars= {_ref_atom_name_chars}, which is larger then 64.\n{features["ref_atom_name_chars"]}\n{"-"*79}')
+  if (len_ref_element:=np.max(features['ref_element'])) >= 128:
+     raise ValueError(f'{len_ref_element=}, which is larger then 128.\n{features["ref_element"]}\n{"-"*79}')
 
   assert len(set([len(v) for k, v in features.items() if k != 'ref_atom_count'])) == 1 ## To check same Atom-level features. # WTF?
   return features

From 7180fe7fedea5eea174081d7b210365f558c2d7b Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Mon, 19 Aug 2024 20:02:23 +0800
Subject: [PATCH 25/42] fix: hetatm input raise from smiles

---
 .../helixfold3/helixfold/data/pipeline_conf_bonds.py   | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
index 0973f48c..ddba9822 100644
--- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
+++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
@@ -45,13 +45,14 @@ def element_map_with_x(atom_symbol):
   # ## one-hot max shape == 128
   return residue_constants.ATOM_ELEMENT.get(atom_symbol, 127)
 
-def convert_atom_id_name(atom_id: str) -> int:
+def convert_atom_id_name(atom_id: str) -> list[int]:
   """
     Converts unique atom_id names to integer of atom_name. need to be padded to length 4.
     Each character is encoded as ord(c) − 32
   """
+  if (len_atom_id:=len(atom_id))>4:
+     raise ValueError(f'atom_id: `{atom_id}` is too long, max length is 4.')
   atom_id_pad = atom_id.ljust(4, ' ')
-  assert len(atom_id_pad) == 4
   return [ord(c) - 32 for c in atom_id_pad]
 
 
@@ -110,12 +111,13 @@ def make_ccd_conf_features(all_chain_info, ccd_preprocessed_dict,
 
       # atom checks
       for atom_id in _ccd_feats['atom_ids']:
-        converted_atom_id_name=convert_atom_id_name(atom_id)
+        converted_atom_id_name=convert_atom_id_name(atom_id.upper())
         if max(converted_atom_id_name)>= 64:
           raise ValueError(f'>>> Problematic atom in ligand ({residue_id=}, {ccd_id=}, {chain_id=}) {atom_id=}, {converted_atom_id_name=}')
+        logging.debug(f'({residue_id=}, {ccd_id=}, {chain_id=}) {atom_id=}, {converted_atom_id_name=}')
       
       features['ref_atom_name_chars'].append(
-                              np.array([convert_atom_id_name(atom_id) for atom_id in _ccd_feats['atom_ids']]
+                              np.array([convert_atom_id_name(atom_id.upper()) for atom_id in _ccd_feats['atom_ids']]
                                                                 , dtype=np.int32))
       
       # here we get ref_space_uid [ Each (chain id, residue index) tuple is assigned an integer on first appearance.]

From 330e4551c5485b9bf2d97abdc0fa78f9eda08d6d Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Mon, 19 Aug 2024 22:05:07 +0800
Subject: [PATCH 26/42] feat: ligand: support obable readable files

---
 .../helixfold3/data/demo_p450_heme.json       | 19 +++++
 .../helixfold3/data/demo_p450_heme_sdf.json   | 19 +++++
 .../data/demo_p450_heme_smiles.json           | 19 +++++
 .../helixfold/infer_scripts/preprocess.py     | 85 ++++++++++++++-----
 .../helixfold3/helixfold/inference.py         | 11 ++-
 5 files changed, 124 insertions(+), 29 deletions(-)
 create mode 100644 apps/protein_folding/helixfold3/data/demo_p450_heme.json
 create mode 100644 apps/protein_folding/helixfold3/data/demo_p450_heme_sdf.json
 create mode 100644 apps/protein_folding/helixfold3/data/demo_p450_heme_smiles.json

diff --git a/apps/protein_folding/helixfold3/data/demo_p450_heme.json b/apps/protein_folding/helixfold3/data/demo_p450_heme.json
new file mode 100644
index 00000000..a414ef58
--- /dev/null
+++ b/apps/protein_folding/helixfold3/data/demo_p450_heme.json
@@ -0,0 +1,19 @@
+{
+    "entities": [
+        {
+            "type": "protein",
+            "sequence": "MDALYKSTVAKFNEVIQLDCSTEFFSIALSSIAGILLLLLLFRSKRHSSLKLPPGKLGIPFIGESFIFLRALRSNSLEQFFDERVKKFGLVFKTSLIGHPTVVLCGPAGNRLILSNEEKLVQMSWPAQFMKLMGENSVATRRGEDHIVMRSALAGFFGPGALQSYIGKMNTEIQSHINEKWKGKDEVNVLPLVRELVFNISAILFFNIYDKQEQDRLHKLLETILVGSFALPIDLPGFGFHRALQGRAKLNKIMLSLIKKRKEDLQSGSATATQDLLSVLLTFRDDKGTPLTNDEILDNFSSLLHASYDTTTSPMALIFKLLSSNPECYQKVVQEQLEILSNKEEGEEITWKDLKAMKYTWQVAQETLRMFPPVFGTFRKAITDIQYDGYTIPKGWKLLWTTYSTHPKDLYFNEPEKFMPSRFDQEGKHVAPYTFLPFGGGQRSCVGWEFSKMEILLFVHHFVKTFSSYTPVDPDEKISGDPLPPLPSKGFSIKLFPRP",
+            "count": 1
+        },
+        {
+            "type": "ligand",
+            "ccd": "HEM",
+            "count": 1
+        },
+        {
+            "type": "ligand",
+            "smiles": "CC1=C2CC[C@@]3(CCCC(=C)[C@H]3C[C@@H](C2(C)C)CC1)C",
+            "count": 1
+        }
+    ]
+}
\ No newline at end of file
diff --git a/apps/protein_folding/helixfold3/data/demo_p450_heme_sdf.json b/apps/protein_folding/helixfold3/data/demo_p450_heme_sdf.json
new file mode 100644
index 00000000..72500c6e
--- /dev/null
+++ b/apps/protein_folding/helixfold3/data/demo_p450_heme_sdf.json
@@ -0,0 +1,19 @@
+{
+    "entities": [
+        {
+            "type": "protein",
+            "sequence": "MDALYKSTVAKFNEVIQLDCSTEFFSIALSSIAGILLLLLLFRSKRHSSLKLPPGKLGIPFIGESFIFLRALRSNSLEQFFDERVKKFGLVFKTSLIGHPTVVLCGPAGNRLILSNEEKLVQMSWPAQFMKLMGENSVATRRGEDHIVMRSALAGFFGPGALQSYIGKMNTEIQSHINEKWKGKDEVNVLPLVRELVFNISAILFFNIYDKQEQDRLHKLLETILVGSFALPIDLPGFGFHRALQGRAKLNKIMLSLIKKRKEDLQSGSATATQDLLSVLLTFRDDKGTPLTNDEILDNFSSLLHASYDTTTSPMALIFKLLSSNPECYQKVVQEQLEILSNKEEGEEITWKDLKAMKYTWQVAQETLRMFPPVFGTFRKAITDIQYDGYTIPKGWKLLWTTYSTHPKDLYFNEPEKFMPSRFDQEGKHVAPYTFLPFGGGQRSCVGWEFSKMEILLFVHHFVKTFSSYTPVDPDEKISGDPLPPLPSKGFSIKLFPRP",
+            "count": 1
+        },
+        {
+            "type": "ligand",
+            "ccd": "HEM",
+            "count": 1
+        },
+        {
+            "type": "ligand",
+            "sdf": "/mnt/data/yinying/tests/helixfold/ligands/60119277-3d.sdf",
+            "count": 1
+        }
+    ]
+}
\ No newline at end of file
diff --git a/apps/protein_folding/helixfold3/data/demo_p450_heme_smiles.json b/apps/protein_folding/helixfold3/data/demo_p450_heme_smiles.json
new file mode 100644
index 00000000..cb6a22f5
--- /dev/null
+++ b/apps/protein_folding/helixfold3/data/demo_p450_heme_smiles.json
@@ -0,0 +1,19 @@
+{
+    "entities": [
+        {
+            "type": "protein",
+            "sequence": "MDALYKSTVAKFNEVIQLDCSTEFFSIALSSIAGILLLLLLFRSKRHSSLKLPPGKLGIPFIGESFIFLRALRSNSLEQFFDERVKKFGLVFKTSLIGHPTVVLCGPAGNRLILSNEEKLVQMSWPAQFMKLMGENSVATRRGEDHIVMRSALAGFFGPGALQSYIGKMNTEIQSHINEKWKGKDEVNVLPLVRELVFNISAILFFNIYDKQEQDRLHKLLETILVGSFALPIDLPGFGFHRALQGRAKLNKIMLSLIKKRKEDLQSGSATATQDLLSVLLTFRDDKGTPLTNDEILDNFSSLLHASYDTTTSPMALIFKLLSSNPECYQKVVQEQLEILSNKEEGEEITWKDLKAMKYTWQVAQETLRMFPPVFGTFRKAITDIQYDGYTIPKGWKLLWTTYSTHPKDLYFNEPEKFMPSRFDQEGKHVAPYTFLPFGGGQRSCVGWEFSKMEILLFVHHFVKTFSSYTPVDPDEKISGDPLPPLPSKGFSIKLFPRP",
+            "count": 1
+        },
+        {
+            "type": "ligand",
+            "smiles": "CC1=C(C2=CC3=C(C(=C([N-]3)C=C4C(=C(C(=N4)C=C5C(=C(C(=N5)C=C1[N-]2)C)C=C)C)C=C)C)CCC(=O)O)CCC(=O)O.[Fe+2]",
+            "count": 1
+        },
+        {
+            "type": "ligand",
+            "smiles": "CC1=C2CC[C@@]3(CCCC(=C)[C@H]3C[C@@H](C2(C)C)CC1)C",
+            "count": 1
+        }
+    ]
+}
\ No newline at end of file
diff --git a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py
index eb8eb14f..f668f1bd 100644
--- a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py
+++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py
@@ -16,6 +16,8 @@
 import subprocess
 import tempfile
 import itertools
+from absl import logging
+from typing import Tuple, Union, Mapping, Literal, Callable, Any
 import rdkit
 from rdkit import Chem
 from rdkit.Chem import AllChem
@@ -138,26 +140,54 @@ def smiles_to_ETKDGMol(smiles):
     return optimal_mol_wo_H
 
 
-def smiles_toMol_obabel(smiles):
-    """
-        generate mol from smiles using obabel;
-    """    
-    
-    OBABEL_BIN = os.getenv('OBABEL_BIN')
-    if not (OBABEL_BIN and os.path.isfile(OBABEL_BIN)):
-        raise FileNotFoundError(f'Cannot find obabel binary at {OBABEL_BIN}.')
-    
-    with tempfile.NamedTemporaryFile(suffix=".mol2") as temp_file:
-        print(f"[OBABEL] Temporary file created: {temp_file.name}")
-        obabel_cmd = f"{OBABEL_BIN} -:'{smiles}' -omol2 -O{temp_file.name} --gen3d"
+class Mol2MolObabel:
+    def __init__(self):
+        self.obabel_bin = os.getenv('OBABEL_BIN')
+        if not (self.obabel_bin and os.path.isfile(self.obabel_bin)):
+            raise FileNotFoundError(f'Cannot find obabel binary at {self.obabel_bin}.')
+        
+        # Get the supported formats
+        self.supported_formats: Tuple[str] = self._get_supported_formats()
+
+    def _get_supported_formats(self) -> Tuple[str]:
+        """
+        Retrieves the list of supported formats from obabel and filters out write-only formats.
+        
+        Returns:
+            tuple: A tuple of supported input formats.
+        """
+        obabel_cmd = f"{self.obabel_bin} -L formats"
         ret = subprocess.run(obabel_cmd, shell=True, capture_output=True, text=True)
-        mol = Chem.MolFromMol2File(temp_file.name, sanitize=False)
-        if '3D coordinate generation failed' in ret.stderr:
-            mol = generate_ETKDGv3_conformer(mol)
-        optimal_mol_wo_H = Chem.RemoveAllHs(mol, sanitize=False)
-    return optimal_mol_wo_H
+        formats = [line.split()[0] for line in ret.stdout.splitlines() if '[Write-only]' not in line]
+        formats.append('smiles')
+        
+        return tuple(formats)
 
+    def _perform_conversion(self, input_type: str, input_value: str) -> Chem.Mol:
+        with tempfile.NamedTemporaryFile(suffix=".mol2") as temp_file:
+            print(f"[OBABEL] Temporary file created: {temp_file.name}")
+            if input_type == 'smiles':
+                obabel_cmd = f"{self.obabel_bin} -:'{input_value}' -omol2 -O{temp_file.name} --gen3d"
+            else:
+                obabel_cmd = f"{self.obabel_bin} -i {input_type} {input_value} -omol2 -O{temp_file.name} --gen3d"
+            ret = subprocess.run(obabel_cmd, shell=True, capture_output=True, text=True)
+            mol = Chem.MolFromMol2File(temp_file.name, sanitize=False)
+            if '3D coordinate generation failed' in ret.stderr:
+                mol = generate_ETKDGv3_conformer(mol)
+            optimal_mol_wo_H = Chem.RemoveAllHs(mol, sanitize=False)
+            logging.debug(f'Converted `{input_type}`: {input_value}')
+            return optimal_mol_wo_H
+
+    def _convert_to_mol(self, input_type: str, input_value: str) -> Chem.Mol:
+        if input_type not in self.supported_formats:
+            raise ValueError(f'Unsupported small molecule input: {input_type}. \nSupported formats: \n{self.supported_formats}\n')
+
+        if input_type != 'smiles' and not os.path.isfile(input_value):
+            raise FileNotFoundError(f'Cannot find the {input_type.upper()} file at {input_value}.')
+        
+        return self._perform_conversion(input_type, input_value)
 
+    __call__: Callable[[str, str], Chem.Mol] = _convert_to_mol
 def polymer_convert(items):
     """
         "type": "protein",                          
@@ -192,7 +222,7 @@ def polymer_convert(items):
     }
 
 
-def ligand_convert(items):
+def ligand_convert(items: Mapping[str, Union[int, str]]):
     """
         "type": "ligand",
         "ccd": "ATP", or "smiles": "CCccc(O)ccc",
@@ -200,22 +230,31 @@ def ligand_convert(items):
     """
     dtype = items['type']
     count = items['count']
+    converter=Mol2MolObabel()
     
     msa_seqs = ""
     _ccd_seqs = []
     ccd_to_extra_mol_infos = {}
     if 'ccd' in items:
         _ccd_seqs.append(f"({items['ccd']})")
-    elif 'smiles' in items:
-        _ccd_seqs.append(f"(UNK-)")
+
+    
+    elif any(f in items for f in converter.supported_formats):
+        for k in converter.supported_formats:
+            if k in items:
+                break
+
+        ligand_name="UNK-"
+        _ccd_seqs.append(f"({ligand_name})")
         # mol_wo_h = smiles_to_ETKDGMol(items['smiles'])
-        mol_wo_h = smiles_toMol_obabel(items['smiles'])
+        
+        mol_wo_h = converter(k, items[k])
         _extra_mol_infos = make_basic_info_fromMol(mol_wo_h)
         ccd_to_extra_mol_infos = {
-            "UNK-": _extra_mol_infos
+            ligand_name: _extra_mol_infos
         }
     else:
-        raise ValueError(f'not support for the {dtype} in ligand_convert')
+        raise ValueError(f'not support for the {dtype} in ligand_convert, please check the input. \nSupported input: {converter.supported_formats}')
     ccd_seqs = ''.join(_ccd_seqs) ## (GLY)(ALA).....
 
     # repeat_ccds, repeat_fasta = [ccd_seqs], [msa_seqs]
diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py
index 7734c964..76f40d96 100644
--- a/apps/protein_folding/helixfold3/helixfold/inference.py
+++ b/apps/protein_folding/helixfold3/helixfold/inference.py
@@ -102,16 +102,15 @@ def preprocess_json_entity(json_path, out_dir):
 def convert_to_json_compatible(obj):
     if isinstance(obj, np.ndarray):
         return obj.tolist()
-    elif isinstance(obj, np.integer):
+    if isinstance(obj, np.integer):
         return int(obj)
-    elif isinstance(obj, np.floating):
+    if isinstance(obj, np.floating):
         return float(obj)
-    elif isinstance(obj, dict):
+    if isinstance(obj, dict):
         return {k: convert_to_json_compatible(v) for k, v in obj.items()}
-    elif isinstance(obj, list):
+    if isinstance(obj, list):
         return [convert_to_json_compatible(i) for i in obj]
-    else:
-        return obj
+    return obj
 
 def resolve_bin_path(cfg_path: str, default_binary_name: str)-> str:
     """Helper function to resolve the binary path."""

From dce79f8e787c83bfd379d65efa12b7204ad980e0 Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Mon, 19 Aug 2024 22:23:49 +0800
Subject: [PATCH 27/42] fix: output logs

---
 .../helixfold3/data/demo_3fap_protein_sm.json | 19 ++++++++++++++
 .../helixfold/infer_scripts/preprocess.py     | 26 ++++++++++---------
 2 files changed, 33 insertions(+), 12 deletions(-)
 create mode 100644 apps/protein_folding/helixfold3/data/demo_3fap_protein_sm.json

diff --git a/apps/protein_folding/helixfold3/data/demo_3fap_protein_sm.json b/apps/protein_folding/helixfold3/data/demo_3fap_protein_sm.json
new file mode 100644
index 00000000..6b2f9418
--- /dev/null
+++ b/apps/protein_folding/helixfold3/data/demo_3fap_protein_sm.json
@@ -0,0 +1,19 @@
+{
+    "entities": [
+        {
+            "type": "protein",
+            "sequence": "GVQVETISPGDGRTFPKRGQTCVVHYTGMLEDGKKFDSSRDRNKPFKFMLGKQEVIRGWEEGVAQMSVGQRAKLTISPDYAYGATGHPGIIPPHATLVFDVELLKLE",
+            "count": 1
+        },
+        {
+            "type": "protein",
+            "sequence": "VAILWHEMWHEGLEEASRLYFGERNVKGMFEVLEPLHAMMERGPQTLKETSFNQAYGRDLMEAQEWCRKYMKSGNVKDLTQAWDLYYHVFRRIS",
+            "count": 1
+        },
+        {
+            "type": "ligand",
+            "smiles": "Cc1ccc(s1)[C@@H]\\2C[C@@H]3CC[C@H]([C@@](O3)(C(=O)C(=O)N4CCCC[C@H]4C(=O)O[C@@H](CC(=O)[C@@H](/C=C(/[C@H]([C@H](C(=O)[C@@H](C[C@@H](/C=C/C=C/C=C2\\C)C)C)OC)O)\\C)C)[C@H](C)C[C@@H]5CC[C@H]([C@@H](C5)OC)O)O)C",
+            "count": 1
+        }
+    ]
+}
\ No newline at end of file
diff --git a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py
index f668f1bd..6314fccd 100644
--- a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py
+++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py
@@ -22,6 +22,7 @@
 from rdkit import Chem
 from rdkit.Chem import AllChem
 from helixfold.common import residue_constants
+from helixfold.data.tools import utils
 
 
 ## NOTE: this mapping is only useful for standard dna/rna/protein sequence input.
@@ -165,18 +166,19 @@ def _get_supported_formats(self) -> Tuple[str]:
 
     def _perform_conversion(self, input_type: str, input_value: str) -> Chem.Mol:
         with tempfile.NamedTemporaryFile(suffix=".mol2") as temp_file:
-            print(f"[OBABEL] Temporary file created: {temp_file.name}")
-            if input_type == 'smiles':
-                obabel_cmd = f"{self.obabel_bin} -:'{input_value}' -omol2 -O{temp_file.name} --gen3d"
-            else:
-                obabel_cmd = f"{self.obabel_bin} -i {input_type} {input_value} -omol2 -O{temp_file.name} --gen3d"
-            ret = subprocess.run(obabel_cmd, shell=True, capture_output=True, text=True)
-            mol = Chem.MolFromMol2File(temp_file.name, sanitize=False)
-            if '3D coordinate generation failed' in ret.stderr:
-                mol = generate_ETKDGv3_conformer(mol)
-            optimal_mol_wo_H = Chem.RemoveAllHs(mol, sanitize=False)
-            logging.debug(f'Converted `{input_type}`: {input_value}')
-            return optimal_mol_wo_H
+            with utils.timing(f'converting {input_type} to mol2: {input_value}'):
+                if input_type == 'smiles':
+                    obabel_cmd = f"{self.obabel_bin} -:'{input_value}' -omol2 -O{temp_file.name} --gen3d"
+                    if len(input_value)>60:
+                        logging.warning(f'This takes a while ...')
+                else:
+                    obabel_cmd = f"{self.obabel_bin} -i {input_type} {input_value} -omol2 -O{temp_file.name} --gen3d"
+                ret = subprocess.run(obabel_cmd, shell=True, capture_output=True, text=True)
+                mol = Chem.MolFromMol2File(temp_file.name, sanitize=False)
+                if '3D coordinate generation failed' in ret.stderr:
+                    mol = generate_ETKDGv3_conformer(mol)
+                optimal_mol_wo_H = Chem.RemoveAllHs(mol, sanitize=False)
+                return optimal_mol_wo_H
 
     def _convert_to_mol(self, input_type: str, input_value: str) -> Chem.Mol:
         if input_type not in self.supported_formats:

From 6968f690b223f40eeb681dcc5aa5227d4f145fa5 Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Mon, 19 Aug 2024 23:04:45 +0800
Subject: [PATCH 28/42] fix: template try-except bugs

---
 .../helixfold3/helixfold/data/pipeline_conf_bonds.py |  2 +-
 .../helixfold3/helixfold/data/templates.py           | 12 ++++--------
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
index ddba9822..c8947643 100644
--- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
+++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
@@ -114,7 +114,7 @@ def make_ccd_conf_features(all_chain_info, ccd_preprocessed_dict,
         converted_atom_id_name=convert_atom_id_name(atom_id.upper())
         if max(converted_atom_id_name)>= 64:
           raise ValueError(f'>>> Problematic atom in ligand ({residue_id=}, {ccd_id=}, {chain_id=}) {atom_id=}, {converted_atom_id_name=}')
-        logging.debug(f'({residue_id=}, {ccd_id=}, {chain_id=}) {atom_id=}, {converted_atom_id_name=}')
+        # logging.debug(f'({residue_id=}, {ccd_id=}, {chain_id=}) {atom_id=}, {converted_atom_id_name=}')
       
       features['ref_atom_name_chars'].append(
                               np.array([convert_atom_id_name(atom_id.upper()) for atom_id in _ccd_feats['atom_ids']]
diff --git a/apps/protein_folding/helixfold3/helixfold/data/templates.py b/apps/protein_folding/helixfold3/helixfold/data/templates.py
index f2e3289a..f9efdae0 100644
--- a/apps/protein_folding/helixfold3/helixfold/data/templates.py
+++ b/apps/protein_folding/helixfold3/helixfold/data/templates.py
@@ -817,19 +817,15 @@ def _process_single_hit(
           TemplateAtomMaskAllZerosError) as e:
     # These 3 errors indicate missing mmCIF experimental data rather than a
     # problem with the template search, so turn them into warnings.
-    warning = ('%s_%s (sum_probs: %.2f, rank: %d): feature extracting errors: '
-               '%s, mmCIF parsing errors: %s'
-               % (hit_pdb_code, hit_chain_id, hit.sum_probs, hit.index,
-                  str(e), parsing_result.errors))
+    warning = (f'{hit_pdb_code}_{hit_chain_id} (sum_probs: {hit.sum_probs}, rank: {hit.index}): feature extracting errors: '
+               f'{str(e)}, mmCIF parsing errors: {parsing_result.errors}')
     if strict_error_check:
       return SingleHitResult(features=None, error=warning, warning=None)
     else:
       return SingleHitResult(features=None, error=None, warning=warning)
   except Error as e:
-    error = ('%s_%s (sum_probs: %.2f, rank: %d): feature extracting errors: '
-             '%s, mmCIF parsing errors: %s'
-             % (hit_pdb_code, hit_chain_id, hit.sum_probs, hit.index,
-                str(e), parsing_result.errors))
+    error = (f'{hit_pdb_code}_{hit_chain_id} (sum_probs: {hit.sum_probs}, rank: {hit.index}): feature extracting errors: '
+               f'{str(e)}, mmCIF parsing errors: {parsing_result.errors}')
     return SingleHitResult(features=None, error=error, warning=None)
 
 
From 90f731a5900bfcfe5432bbe47f0265970ef9ed29 Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Mon, 19 Aug 2024 23:10:17 +0800
Subject: [PATCH 29/42] docs: sm file inputs

---
 apps/protein_folding/helixfold3/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/protein_folding/helixfold3/README.md b/apps/protein_folding/helixfold3/README.md
index c8fc9c8c..639a46be 100644
--- a/apps/protein_folding/helixfold3/README.md
+++ b/apps/protein_folding/helixfold3/README.md
@@ -105,8 +105,8 @@ The script `scripts/download_all_data.sh` can be used to download and set up all
 
 There are some demo input under `./data/` for your test and reference. Data input is in the form of JSON containing
 several entities such as `protein`, `ligand`, `nucleic acids`, and `iron`. Proteins and nucleic acids inputs are their sequence.
-HelixFold3 supports input ligand as SMILES or CCD id, please refer to `/data/demo_6zcy_smiles.json` and `demo_output/demo_6zcy_smiles/` 
-for more details about SMILES input. More flexible input will come in soon.
+HelixFold3 supports input ligand as SMILES, CCD id or small molecule files, please refer to `/data/demo_6zcy_smiles.json` and `data/demo_p450_heme_sdf.json` 
+for more details about SMILES input. Flexible input from small molecule is now supported. See `obabel -L formats |grep -v 'Write-only'`
 
 A example of input data is as follows:
 ```json

From 2bd81ad8d5a5d0aa4c23e3f4f9cfa78c249cc337 Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Tue, 20 Aug 2024 08:48:18 +0800
Subject: [PATCH 30/42] drop: buildin logger

---
 .../helixfold/data/mmcif_parsing_paddle.py    |  2 +-
 .../helixfold3/helixfold/data/tools/utils.py  |  2 +-
 .../helixfold/infer_scripts/preprocess.py     | 29 ++++++++++---------
 .../helixfold3/helixfold/inference.py         |  1 -
 4 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/apps/protein_folding/helixfold3/helixfold/data/mmcif_parsing_paddle.py b/apps/protein_folding/helixfold3/helixfold/data/mmcif_parsing_paddle.py
index d5e8f875..2af97446 100644
--- a/apps/protein_folding/helixfold3/helixfold/data/mmcif_parsing_paddle.py
+++ b/apps/protein_folding/helixfold3/helixfold/data/mmcif_parsing_paddle.py
@@ -22,7 +22,7 @@
 from Bio import PDB
 import numpy as np
 import time
-import logging
+from absl import logging
 from helixfold.common.residue_constants import crystallization_aids, ligand_exclusion_list
 
 # Type aliases:
diff --git a/apps/protein_folding/helixfold3/helixfold/data/tools/utils.py b/apps/protein_folding/helixfold3/helixfold/data/tools/utils.py
index c415dac8..c588187d 100644
--- a/apps/protein_folding/helixfold3/helixfold/data/tools/utils.py
+++ b/apps/protein_folding/helixfold3/helixfold/data/tools/utils.py
@@ -16,7 +16,7 @@
 
 import contextlib
 import shutil
-import logging
+from absl import logging
 import tempfile
 import time
 from typing import Optional
diff --git a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py
index 6314fccd..53a0e588 100644
--- a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py
+++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py
@@ -165,20 +165,21 @@ def _get_supported_formats(self) -> Tuple[str]:
         return tuple(formats)
 
     def _perform_conversion(self, input_type: str, input_value: str) -> Chem.Mol:
-        with tempfile.NamedTemporaryFile(suffix=".mol2") as temp_file:
-            with utils.timing(f'converting {input_type} to mol2: {input_value}'):
-                if input_type == 'smiles':
-                    obabel_cmd = f"{self.obabel_bin} -:'{input_value}' -omol2 -O{temp_file.name} --gen3d"
-                    if len(input_value)>60:
-                        logging.warning(f'This takes a while ...')
-                else:
-                    obabel_cmd = f"{self.obabel_bin} -i {input_type} {input_value} -omol2 -O{temp_file.name} --gen3d"
-                ret = subprocess.run(obabel_cmd, shell=True, capture_output=True, text=True)
-                mol = Chem.MolFromMol2File(temp_file.name, sanitize=False)
-                if '3D coordinate generation failed' in ret.stderr:
-                    mol = generate_ETKDGv3_conformer(mol)
-                optimal_mol_wo_H = Chem.RemoveAllHs(mol, sanitize=False)
-                return optimal_mol_wo_H
+        with tempfile.NamedTemporaryFile(suffix=".mol2") as temp_file, utils.timing(f'converting {input_type} to mol2: {input_value}'):
+            if input_type == 'smiles':
+                obabel_cmd = f"{self.obabel_bin} -:'{input_value}' -omol2 -O{temp_file.name} --gen3d"
+                if len(input_value)>60:
+                    logging.warning(f'This takes a while ...')
+            else:
+                obabel_cmd = f"{self.obabel_bin} -i {input_type} {input_value} -omol2 -O{temp_file.name} --gen3d"
+            logging.debug(f'Launching command: `{obabel_cmd}`')
+            ret = subprocess.run(obabel_cmd, shell=True, capture_output=True, text=True)
+            mol = Chem.MolFromMol2File(temp_file.name, sanitize=False)
+            if '3D coordinate generation failed' in ret.stderr:
+                mol = generate_ETKDGv3_conformer(mol)
+            optimal_mol_wo_H = Chem.RemoveAllHs(mol, sanitize=False)
+
+            return optimal_mol_wo_H
 
     def _convert_to_mol(self, input_type: str, input_value: str) -> Chem.Mol:
         if input_type not in self.supported_formats:
diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py
index 76f40d96..e18029a5 100644
--- a/apps/protein_folding/helixfold3/helixfold/inference.py
+++ b/apps/protein_folding/helixfold3/helixfold/inference.py
@@ -22,7 +22,6 @@
 import pickle
 import pathlib
 import shutil
-import logging
 import numpy as np
 import shutil
 

From 07203113071ceb17a4f02d3cceee682d610a672b Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Tue, 20 Aug 2024 08:54:05 +0800
Subject: [PATCH 31/42] drop: buildin logger

---
 .../helixfold3/helixfold/data/mmcif_parsing_paddle.py            | 1 -
 apps/protein_folding/helixfold3/helixfold/utils/model.py         | 1 -
 2 files changed, 2 deletions(-)

diff --git a/apps/protein_folding/helixfold3/helixfold/data/mmcif_parsing_paddle.py b/apps/protein_folding/helixfold3/helixfold/data/mmcif_parsing_paddle.py
index 2af97446..d6f907eb 100644
--- a/apps/protein_folding/helixfold3/helixfold/data/mmcif_parsing_paddle.py
+++ b/apps/protein_folding/helixfold3/helixfold/data/mmcif_parsing_paddle.py
@@ -34,7 +34,6 @@
 SeqRes = str
 MmCIFDict = Mapping[str, Sequence[str]]
 
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 @dataclasses.dataclass(frozen=True)
 class Monomer:
diff --git a/apps/protein_folding/helixfold3/helixfold/utils/model.py b/apps/protein_folding/helixfold3/helixfold/utils/model.py
index 885510bd..1fcc53d4 100644
--- a/apps/protein_folding/helixfold3/helixfold/utils/model.py
+++ b/apps/protein_folding/helixfold3/helixfold/utils/model.py
@@ -17,7 +17,6 @@
 import numpy as np
 import paddle
 import paddle.nn as nn
-import logging
 import io
 
 from helixfold.model import modules_all_atom

From d7bd9eeee7ea95ab9c57248853c6b214a2bd9909 Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Tue, 20 Aug 2024 09:18:53 +0800
Subject: [PATCH 32/42] add: use_3d opt for ligand

---
 .../helixfold3/data/demo_3fap_protein_sm.json      |  3 ++-
 .../helixfold/infer_scripts/preprocess.py          | 14 +++++++-------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/apps/protein_folding/helixfold3/data/demo_3fap_protein_sm.json b/apps/protein_folding/helixfold3/data/demo_3fap_protein_sm.json
index 6b2f9418..caf27f02 100644
--- a/apps/protein_folding/helixfold3/data/demo_3fap_protein_sm.json
+++ b/apps/protein_folding/helixfold3/data/demo_3fap_protein_sm.json
@@ -12,7 +12,8 @@
         },
         {
             "type": "ligand",
-            "smiles": "Cc1ccc(s1)[C@@H]\\2C[C@@H]3CC[C@H]([C@@](O3)(C(=O)C(=O)N4CCCC[C@H]4C(=O)O[C@@H](CC(=O)[C@@H](/C=C(/[C@H]([C@H](C(=O)[C@@H](C[C@@H](/C=C/C=C/C=C2\\C)C)C)OC)O)\\C)C)[C@H](C)C[C@@H]5CC[C@H]([C@@H](C5)OC)O)O)C",
+            "sdf": "/mnt/data/yinying/tests/helixfold/ligands/ARD_ideal.sdf",
+            "use_3d": false,
             "count": 1
         }
     ]
diff --git a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py
index 53a0e588..fa30f96a 100644
--- a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py
+++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py
@@ -164,14 +164,14 @@ def _get_supported_formats(self) -> Tuple[str]:
         
         return tuple(formats)
 
-    def _perform_conversion(self, input_type: str, input_value: str) -> Chem.Mol:
+    def _perform_conversion(self, input_type: str, input_value: str, generate_3d: bool=True) -> Chem.Mol:
         with tempfile.NamedTemporaryFile(suffix=".mol2") as temp_file, utils.timing(f'converting {input_type} to mol2: {input_value}'):
             if input_type == 'smiles':
-                obabel_cmd = f"{self.obabel_bin} -:'{input_value}' -omol2 -O{temp_file.name} --gen3d"
+                obabel_cmd = f"{self.obabel_bin} -:'{input_value}' -omol2 -O{temp_file.name} {'--gen3d' if generate_3d else ''}"
                 if len(input_value)>60:
                     logging.warning(f'This takes a while ...')
             else:
-                obabel_cmd = f"{self.obabel_bin} -i {input_type} {input_value} -omol2 -O{temp_file.name} --gen3d"
+                obabel_cmd = f"{self.obabel_bin} -i {input_type} {input_value} -omol2 -O{temp_file.name} {'--gen3d' if generate_3d else ''}"
             logging.debug(f'Launching command: `{obabel_cmd}`')
             ret = subprocess.run(obabel_cmd, shell=True, capture_output=True, text=True)
             mol = Chem.MolFromMol2File(temp_file.name, sanitize=False)
@@ -181,16 +181,16 @@ def _perform_conversion(self, input_type: str, input_value: str) -> Chem.Mol:
 
             return optimal_mol_wo_H
 
-    def _convert_to_mol(self, input_type: str, input_value: str) -> Chem.Mol:
+    def _convert_to_mol(self, input_type: str, input_value: str, generate_3d: bool=True) -> Chem.Mol:
         if input_type not in self.supported_formats:
             raise ValueError(f'Unsupported small molecule input: {input_type}. \nSupported formats: \n{self.supported_formats}\n')
 
         if input_type != 'smiles' and not os.path.isfile(input_value):
             raise FileNotFoundError(f'Cannot find the {input_type.upper()} file at {input_value}.')
         
-        return self._perform_conversion(input_type, input_value)
+        return self._perform_conversion(input_type, input_value, generate_3d)
 
-    __call__: Callable[[str, str], Chem.Mol] = _convert_to_mol
+    __call__: Callable[[str, str, bool], Chem.Mol] = _convert_to_mol
 def polymer_convert(items):
     """
         "type": "protein",                          
@@ -251,7 +251,7 @@ def ligand_convert(items: Mapping[str, Union[int, str]]):
         _ccd_seqs.append(f"({ligand_name})")
         # mol_wo_h = smiles_to_ETKDGMol(items['smiles'])
         
-        mol_wo_h = converter(k, items[k])
+        mol_wo_h = converter(k, items[k], items.get('use_3d', True))
         _extra_mol_infos = make_basic_info_fromMol(mol_wo_h)
         ccd_to_extra_mol_infos = {
             ligand_name: _extra_mol_infos

From 96a3de830649536a5cf4efafad0341972e4230ca Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Tue, 20 Aug 2024 17:07:17 +0800
Subject: [PATCH 33/42] fix: enable mutiple config overrides

---
 .../helixfold3/helixfold/config/helixfold.yaml        |  2 +-
 .../helixfold3/helixfold/model/config.py              | 11 +++++++----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
index bc163019..6b40ea23 100644
--- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
+++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
@@ -60,7 +60,7 @@ preset:
 
 # Other configurations
 other:
-  maxit_binary: /mnt/data/yinying/software/maxit/maxit-v11.100-prod-src/bin/maxit  # Corresponds to --maxit_binary
+  maxit_binary: /mnt/data/software/maxit/maxit-v11.100-prod-src/bin/maxit  # Corresponds to --maxit_binary
 
 
 # CONFIG_DIFFS for advanced configuration
diff --git a/apps/protein_folding/helixfold3/helixfold/model/config.py b/apps/protein_folding/helixfold3/helixfold/model/config.py
index f6d2fe97..f9dbbf1d 100644
--- a/apps/protein_folding/helixfold3/helixfold/model/config.py
+++ b/apps/protein_folding/helixfold3/helixfold/model/config.py
@@ -39,14 +39,17 @@ def model_config(config_diffs: Union[str, DictConfig, dict[str, dict[str, Any]]]
       cfg.merge_with_dotlist(updated_config)
       print(f'Updated config from `CONFIG_DIFFS.{preset_name}`: {updated_config}')
 
-      return cfg
     
     # update from detailed configuration
     if any(root_kw in config_diffs for root_kw in CONFIG_ALLATOM):
 
-      cfg.merge_with(config_diffs) # merge to override
-      print(f'Updated config from `CONFIG_DIFFS`: {config_diffs}')
-      return cfg
+      for root_kw in CONFIG_ALLATOM:
+        if root_kw not in config_diffs:
+          continue
+        cfg.merge_with(DictConfig({root_kw:config_diffs[root_kw]})) # merge to override
+        print(f'Updated config from `CONFIG_DIFFS`:{root_kw}: {config_diffs[root_kw]}')
+    
+    return cfg
   
   raise ValueError(f'Invalid config_diffs ({type(config_diffs)}): {config_diffs}')
     

From 55e0c19124c655424b41f7ce132fe8262dc4f49f Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Fri, 23 Aug 2024 15:30:36 +0800
Subject: [PATCH 34/42] dev:major: covalent bond

---
 .../helixfold3/data/7s69_glycan.sdf           | 155 ++++++++++++++++++
 .../helixfold3/data/demo_7s69_coval.json      |  19 +++
 .../helixfold/data/pipeline_conf_bonds.py     | 121 ++++++++++++--
 .../infer_scripts/feature_processing_aa.py    |  44 ++---
 .../helixfold/infer_scripts/preprocess.py     |  98 ++++++-----
 5 files changed, 362 insertions(+), 75 deletions(-)
 create mode 100644 apps/protein_folding/helixfold3/data/7s69_glycan.sdf
 create mode 100644 apps/protein_folding/helixfold3/data/demo_7s69_coval.json

diff --git a/apps/protein_folding/helixfold3/data/7s69_glycan.sdf b/apps/protein_folding/helixfold3/data/7s69_glycan.sdf
new file mode 100644
index 00000000..8ac6a9e0
--- /dev/null
+++ b/apps/protein_folding/helixfold3/data/7s69_glycan.sdf
@@ -0,0 +1,155 @@
+
+ OpenBabel03042416223D
+
+ 72 77  0  0  1  0  0  0  0  0999 V2000
+   29.7340    3.2540   76.7430 C   0  0  0  0  0  2  0  0  0  0  0  0
+   29.8160    4.4760   77.6460 C   0  0  1  0  0  3  0  0  0  0  0  0
+   28.5260    5.2840   77.5530 C   0  0  2  0  0  3  0  0  0  0  0  0
+   28.1780    5.5830   76.1020 C   0  0  1  0  0  3  0  0  0  0  0  0
+   28.2350    4.3240   75.2420 C   0  0  1  0  0  3  0  0  0  0  0  0
+   28.1040    4.6170   73.7650 C   0  0  0  0  0  2  0  0  0  0  0  0
+   31.3020    3.8250   79.4830 C   0  0  0  0  0  0  0  0  0  0  0  0
+   31.3910    3.4410   80.9280 C   0  0  0  0  0  1  0  0  0  0  0  0
+   30.0760    4.0880   79.0210 N   0  0  0  0  0  2  0  0  0  0  0  0
+   28.6870    6.5050   78.2670 O   0  0  0  0  0  1  0  0  0  0  0  0
+   26.8490    6.0910   76.0350 O   0  0  0  0  0  0  0  0  0  0  0  0
+   29.4950    3.6650   75.4130 O   0  0  0  0  0  0  0  0  0  0  0  0
+   29.3670    4.5550   73.1150 O   0  0  0  0  0  1  0  0  0  0  0  0
+   32.2950    3.8940   78.7640 O   0  0  0  0  0  0  0  0  0  0  0  0
+   26.7420    7.4140   75.6950 C   0  0  1  0  0  3  0  0  0  0  0  0
+   25.2700    7.7830   75.6110 C   0  0  1  0  0  3  0  0  0  0  0  0
+   25.1290    9.2300   75.1610 C   0  0  2  0  0  3  0  0  0  0  0  0
+   25.9180   10.1440   76.0880 C   0  0  1  0  0  3  0  0  0  0  0  0
+   27.3630    9.6720   76.2210 C   0  0  1  0  0  3  0  0  0  0  0  0
+   28.1310   10.4360   77.2730 C   0  0  0  0  0  2  0  0  0  0  0  0
+   23.8820    5.8170   75.1400 C   0  0  0  0  0  0  0  0  0  0  0  0
+   23.1980    5.0100   74.0810 C   0  0  0  0  0  1  0  0  0  0  0  0
+   24.5530    6.8930   74.7160 N   0  0  0  0  0  2  0  0  0  0  0  0
+   23.7530    9.5950   75.1670 O   0  0  0  0  0  1  0  0  0  0  0  0
+   25.9170   11.4700   75.5730 O   0  0  0  0  0  0  0  0  0  0  0  0
+   27.4050    8.2900   76.6040 O   0  0  0  0  0  0  0  0  0  0  0  0
+   29.5300   10.4030   77.0280 O   0  0  0  0  0  1  0  0  0  0  0  0
+   23.8300    5.5110   76.3290 O   0  0  0  0  0  0  0  0  0  0  0  0
+   25.3940   12.4250   76.4090 C   0  0  1  0  0  3  0  0  0  0  0  0
+   25.9490   13.7680   75.9090 C   0  0  2  0  0  3  0  0  0  0  0  0
+   25.1320   14.9560   76.4900 C   0  0  2  0  0  3  0  0  0  0  0  0
+   23.6130   14.6900   76.6390 C   0  0  1  0  0  3  0  0  0  0  0  0
+   23.3700   13.3000   77.2280 C   0  0  1  0  0  3  0  0  0  0  0  0
+   21.9020   12.9360   77.3500 C   0  0  0  0  0  2  0  0  0  0  0  0
+   25.9010   13.8490   74.4810 O   0  0  0  0  0  1  0  0  0  0  0  0
+   25.3420   16.1410   75.7110 O   0  0  0  0  0  0  0  0  0  0  0  0
+   23.0420   15.6520   77.5170 O   0  0  0  0  0  1  0  0  0  0  0  0
+   23.9910   12.3690   76.3570 O   0  0  0  0  0  0  0  0  0  0  0  0
+   21.3660   12.8480   76.0500 O   0  0  0  0  0  0  0  0  0  0  0  0
+   20.8090   11.6500   75.6780 C   0  0  2  0  0  3  0  0  0  0  0  0
+   20.6800   11.6410   74.1740 C   0  0  2  0  0  3  0  0  0  0  0  0
+   19.5510   12.5850   73.8180 C   0  0  2  0  0  3  0  0  0  0  0  0
+   18.2370   12.0940   74.4540 C   0  0  1  0  0  3  0  0  0  0  0  0
+   18.4030   11.9240   75.9810 C   0  0  1  0  0  3  0  0  0  0  0  0
+   17.2710   11.1260   76.6120 C   0  0  0  0  0  2  0  0  0  0  0  0
+   20.2900   10.3510   73.7080 O   0  0  0  0  0  1  0  0  0  0  0  0
+   19.4280   12.7380   72.4110 O   0  0  0  0  0  0  0  0  0  0  0  0
+   17.2120   13.0460   74.2030 O   0  0  0  0  0  1  0  0  0  0  0  0
+   19.6260   11.2000   76.3010 O   0  0  0  0  0  0  0  0  0  0  0  0
+   16.0670   11.4490   75.9360 O   0  0  0  0  0  1  0  0  0  0  0  0
+   20.2190   13.6280   71.7260 C   0  0  2  0  0  3  0  0  0  0  0  0
+   19.6090   14.0000   70.3810 C   0  0  2  0  0  3  0  0  0  0  0  0
+   19.6360   12.7820   69.4880 C   0  0  2  0  0  3  0  0  0  0  0  0
+   21.0860   12.3100   69.3240 C   0  0  1  0  0  3  0  0  0  0  0  0
+   21.7030   12.0240   70.7120 C   0  0  1  0  0  3  0  0  0  0  0  0
+   23.1940   11.7460   70.6620 C   0  0  0  0  0  2  0  0  0  0  0  0
+   20.4080   14.9810   69.7000 O   0  0  0  0  0  1  0  0  0  0  0  0
+   19.0310   13.0500   68.2340 O   0  0  0  0  0  1  0  0  0  0  0  0
+   21.1060   11.1280   68.5380 O   0  0  0  0  0  1  0  0  0  0  0  0
+   21.5380   13.1700   71.5840 O   0  0  0  0  0  0  0  0  0  0  0  0
+   23.8240   12.5210   71.6820 O   0  0  0  0  0  1  0  0  0  0  0  0
+   26.0070   17.3020   76.0200 C   0  0  2  0  0  3  0  0  0  0  0  0
+   27.0750   17.5250   74.9350 C   0  0  2  0  0  3  0  0  0  0  0  0
+   28.3660   16.8320   75.3290 C   0  0  2  0  0  3  0  0  0  0  0  0
+   28.7820   17.2470   76.7510 C   0  0  1  0  0  3  0  0  0  0  0  0
+   27.6930   16.8120   77.7320 C   0  0  1  0  0  3  0  0  0  0  0  0
+   27.9770   17.2020   79.1710 C   0  0  0  0  0  2  0  0  0  0  0  0
+   27.3990   18.9140   74.8010 O   0  0  0  0  0  1  0  0  0  0  0  0
+   29.4060   17.0990   74.3950 O   0  0  0  0  0  1  0  0  0  0  0  0
+   30.0160   16.6410   77.0930 O   0  0  0  0  0  1  0  0  0  0  0  0
+   26.4610   17.4820   77.3520 O   0  0  0  0  0  0  0  0  0  0  0  0
+   27.3660   18.4620   79.4040 O   0  0  0  0  0  1  0  0  0  0  0  0
+  1  2  1  0  0  0  0
+  1 12  1  0  0  0  0
+  2  3  1  0  0  0  0
+  2  9  1  1  0  0  0
+  3 10  1  1  0  0  0
+  3  4  1  0  0  0  0
+  4  5  1  0  0  0  0
+  4 11  1  1  0  0  0
+  5  6  1  6  0  0  0
+  5 12  1  0  0  0  0
+  6 13  1  0  0  0  0
+  7 14  2  0  0  0  0
+  7  8  1  0  0  0  0
+  7  9  1  0  0  0  0
+ 15 16  1  0  0  0  0
+ 15 11  1  1  0  0  0
+ 15 26  1  0  0  0  0
+ 16 23  1  6  0  0  0
+ 16 17  1  0  0  0  0
+ 17 18  1  0  0  0  0
+ 17 24  1  1  0  0  0
+ 18 25  1  6  0  0  0
+ 18 19  1  0  0  0  0
+ 19 20  1  1  0  0  0
+ 19 26  1  0  0  0  0
+ 20 27  1  0  0  0  0
+ 21 22  1  0  0  0  0
+ 21 23  1  0  0  0  0
+ 21 28  2  0  0  0  0
+ 29 38  1  0  0  0  0
+ 29 25  1  6  0  0  0
+ 29 30  1  0  0  0  0
+ 30 35  1  6  0  0  0
+ 30 31  1  0  0  0  0
+ 31 32  1  0  0  0  0
+ 31 36  1  6  0  0  0
+ 32 33  1  0  0  0  0
+ 32 37  1  1  0  0  0
+ 33 38  1  0  0  0  0
+ 33 34  1  6  0  0  0
+ 34 39  1  0  0  0  0
+ 40 49  1  0  0  0  0
+ 40 41  1  0  0  0  0
+ 40 39  1  1  0  0  0
+ 41 46  1  1  0  0  0
+ 41 42  1  0  0  0  0
+ 42 43  1  0  0  0  0
+ 42 47  1  6  0  0  0
+ 43 48  1  1  0  0  0
+ 43 44  1  0  0  0  0
+ 44 49  1  0  0  0  0
+ 44 45  1  6  0  0  0
+ 45 50  1  0  0  0  0
+ 51 47  1  6  0  0  0
+ 51 60  1  0  0  0  0
+ 51 52  1  0  0  0  0
+ 52 53  1  0  0  0  0
+ 52 57  1  6  0  0  0
+ 53 54  1  0  0  0  0
+ 53 58  1  6  0  0  0
+ 54 59  1  6  0  0  0
+ 54 55  1  0  0  0  0
+ 55 56  1  6  0  0  0
+ 55 60  1  0  0  0  0
+ 56 61  1  0  0  0  0
+ 62 71  1  0  0  0  0
+ 62 36  1  1  0  0  0
+ 62 63  1  0  0  0  0
+ 63 68  1  1  0  0  0
+ 63 64  1  0  0  0  0
+ 64 69  1  6  0  0  0
+ 64 65  1  0  0  0  0
+ 65 70  1  1  0  0  0
+ 65 66  1  0  0  0  0
+ 66 67  1  1  0  0  0
+ 66 71  1  0  0  0  0
+ 67 72  1  0  0  0  0
+M  END
+$$$$
diff --git a/apps/protein_folding/helixfold3/data/demo_7s69_coval.json b/apps/protein_folding/helixfold3/data/demo_7s69_coval.json
new file mode 100644
index 00000000..fc53c9e9
--- /dev/null
+++ b/apps/protein_folding/helixfold3/data/demo_7s69_coval.json
@@ -0,0 +1,19 @@
+{
+    "entities": [
+        {
+            "type": "protein",
+            "sequence": "DRHHHHHHKLGKMKIVEEPNSFGLNNPFLSQTNKLQPRVQPSPVSGPSHLFRLAGKCFNLVESTYKYELCPFHNVTQHEQTFRWNAYSGILGIWQEWDIENNTFSGMWMREGDSCGNKNRQTKVLLVCGKANKLSSVSEPSTCLYSLTFETPLVCHPHSLLVYPTLSEGLQEKWNEAEQALYDELITEQGHGKILKEIFREAGYLKTTKPDGEGKETQDKPKEFDSLEKCNKGYTELTSEIQRLKKMLNEHGISYVTNGTSRSEGQPAEVNTTFARGEDKVHLRGDTGIRDGQ",
+            "count": 1
+        },
+        {
+            "type": "ligand",
+            "sdf": "/repo/PaddleHelix/apps/protein_folding/helixfold3/data/7s69_glycan.sdf",
+            "count": 1
+        },
+        {
+            "type": "bond",
+            "bond": "A,ASN,74,ND2,B,UNK1,1,C,covale,2.3",
+            "_comment": "'A,74,ND2:B,1:CW,null' from RF2AA"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
index c8947643..7a9c2960 100644
--- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
+++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
@@ -1,19 +1,23 @@
 """Functions for building the input features (reference ccd features) for the HelixFold model."""
 
 import collections
+from dataclasses import dataclass
 import gzip
 import os
 import pickle
-from typing import Any, Optional
+import re
+from typing import Any, List, Literal, Optional, Tuple
 
 from absl import logging
 from immutabledict import immutabledict
-from helixfold.common import residue_constants
 import numpy as np
+from openbabel import openbabel
 
+from helixfold.common import residue_constants
 from helixfold.data.tools import utils
 
 
+
 ALLOWED_LIGAND_BONDS_TYPE = {
     "SING": 1,
     "DOUB": 2,
@@ -22,6 +26,98 @@
     "AROM": 12,
 }
 
+# Define the possible bond types as a Literal
+BondType = Literal["covale", "metal", "hydrogen", "ionic", "disulfide", "aromatic"]
+
+
+@dataclass
+class AtomPartner:
+    """
+    Represents one partner atom in a covalent bond.
+
+    Attributes:
+        label_asym_id (str): The asymmetry identifier for the partner atom (i.e., chain ID).
+        label_comp_id (str): The component identifier for the partner atom (i.e., residue name).
+        seq_id (str): The sequence identifier for the partner atom (merged label_seq_id and auth_seq_id).
+        label_atom_id (str): The atom identifier for the partner atom (i.e., atom name).
+    """
+
+    label_asym_id: str  # Chain ID
+    label_comp_id: str  # Residue name
+    seq_id: str         # Merged sequence ID
+    label_atom_id: str  # Atom name
+
+
+@dataclass
+class CovalentBond:
+    """
+    Represents a covalent bond between two atoms in a molecular structure.
+
+    Attributes:
+        atom_1 (AtomPartner): The first partner atom in the bond.
+        atom_2 (AtomPartner): The second partner atom in the bond.
+        bond_type (BondType): The type of the bond.
+        pdbx_dist_value (float): The distance value as defined in the PDBx/mmCIF format.
+    """
+
+    atom_1: AtomPartner
+    atom_2: AtomPartner
+    bond_type: BondType
+    pdbx_dist_value: float
+
+def parse_covalent_bond_input(input_string: str) -> List[CovalentBond]:
+    """
+    Parses a human-readable string into a list of CovalentBond objects.
+
+    Args:
+        input_string (str): A string representing covalent bonds, where each bond is
+                            described by two atom partners separated by a comma,
+                            and multiple bonds are separated by semicolons.
+                            Example: "A,GLY,25,CA,A,GLY,25,N,covale,1.32; B,HIS,58,ND1,B,HIS,58,CE1,covale,1.39"
+
+    Returns:
+        List[CovalentBond]: A list of CovalentBond objects.
+    """
+    covalent_bonds = []
+
+    # Split the input string by semicolons to separate individual covalent bonds
+    bond_strings = input_string.split(';')
+
+    for bond_str in bond_strings:
+        # Split the individual bond string by commas to separate attributes
+        bond_parts = bond_str.split(',')
+
+        if len(bond_parts) != 10:
+            raise ValueError(f"Invalid bond format: {bond_str}. Expected 10 fields per bond.")
+
+        # Create AtomPartner instances for the two atoms in the bond
+        atom_1 = AtomPartner(
+            label_asym_id=bond_parts[0].strip(),
+            label_comp_id=bond_parts[1].strip(),
+            seq_id=bond_parts[2].strip(),
+            label_atom_id=bond_parts[3].strip()
+        )
+
+        atom_2 = AtomPartner(
+            label_asym_id=bond_parts[4].strip(),
+            label_comp_id=bond_parts[5].strip(),
+            seq_id=bond_parts[6].strip(),
+            label_atom_id=bond_parts[7].strip()
+        )
+
+        # Create a CovalentBond instance
+        covalent_bond = CovalentBond(
+            atom_1=atom_1,
+            atom_2=atom_2,
+            bond_type=bond_parts[8].strip(),
+            pdbx_dist_value=float(bond_parts[9].strip())
+        )
+
+        # Append the CovalentBond instance to the list
+        covalent_bonds.append(covalent_bond)
+
+    return covalent_bonds
+
 def load_ccd_dict(ccd_preprocessed_path: str) -> immutabledict[str, Any]:
     if not os.path.exists(ccd_preprocessed_path):
       raise FileNotFoundError(f'[CCD] ccd_preprocessed_path: {ccd_preprocessed_path} not exist.')
@@ -151,7 +247,7 @@ def make_ccd_conf_features(all_chain_info, ccd_preprocessed_dict,
   return features
 
 
-def make_bond_features(covalent_bond, all_chain_info, ccd_preprocessed_dict, 
+def make_bond_features(covalent_bond: List[CovalentBond], all_chain_info, ccd_preprocessed_dict, 
                                       extra_feats: Optional[dict]=None):
   """
       all_chain_info: dict, (chain_type_chain_id): ccd_seq (list of ccd), such as: protein_A: ['ALA', 'MET', 'GLY']
@@ -172,24 +268,29 @@ def make_bond_features(covalent_bond, all_chain_info, ccd_preprocessed_dict,
   _set_chain_id_list = set(chain_id_list)
   parsed_covalent_bond = []
   for _bond in covalent_bond:
-    left_bond_atomid, right_bond_atomid = _bond['ptnr1_label_atom_id'], _bond['ptnr2_label_atom_id']
-    left_bond_name, right_bond_name = _bond['ptnr1_label_comp_id'], _bond['ptnr2_label_comp_id']
-    left_bond, right_bond = _bond['ptnr1_label_asym_id'], _bond['ptnr2_label_asym_id']
+    # Accessing the AtomPartner attributes for both atoms in the covalent bond
+    left_bond_atomid, right_bond_atomid = _bond.atom_1.label_atom_id, _bond.atom_2.label_atom_id
+    left_bond_name, right_bond_name = _bond.atom_1.label_comp_id, _bond.atom_2.label_comp_id
+    left_bond, right_bond = _bond.atom_1.label_asym_id, _bond.atom_2.label_asym_id
     
-    left_bond_idx, right_bond_idx = _bond['ptnr1_label_seq_id'], _bond['ptnr2_label_seq_id']
-    auth_left_idx, auth_right_idx = _bond['ptnr1_auth_seq_id'], _bond['ptnr2_auth_seq_id']
+    left_bond_idx, right_bond_idx = _bond.atom_1.seq_id, _bond.atom_2.seq_id
+    auth_left_idx, auth_right_idx = _bond.atom_1.seq_id, _bond.atom_2.seq_id
+
     left_bond_idx = 1 if left_bond_idx == '.' else left_bond_idx
     right_bond_idx = 1 if right_bond_idx == '.' else right_bond_idx
     
-    if _bond['bond_type'] != "covale":
+    if _bond.bond_type != "covale":
+      logging.warning(f'Ignore non-covale bond type: {_bond.bond_type}')
       continue
     
-    if _bond['pdbx_dist_value'] > 2.4:
+    if _bond.pdbx_dist_value > 2.4:
       # the covalent_bond is cut off by distance=2.4
+      logging.warning(f'Ignore bonding with distance > 2.4: {_bond.pdbx_dist_value}')
       continue
     
     ## When some chainID is filtered, bond need to be filtered too.
     if (left_bond not in _set_chain_id_list) or (right_bond not in _set_chain_id_list):
+      logging.warning(f'Ignore bonding with left and right out of chain list: ')
       continue
 
     parsed_covalent_bond.append([left_bond, left_bond_name, left_bond_idx, left_bond_atomid, auth_left_idx,
diff --git a/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py
index fd875159..37b09a99 100644
--- a/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py
+++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py
@@ -4,7 +4,7 @@
 import os
 from pathlib import Path
 import pickle
-from typing import Optional, Tuple, Any
+from typing import List, Mapping, Optional, Tuple
 
 import numpy as np
 from absl import logging
@@ -20,7 +20,7 @@
 
 from helixfold.data.tools import utils
 
-from .preprocess import digit2alphabet
+from .preprocess import Entity, digit2alphabet
 
 
 POLYMER_STANDARD_RESI_ATOMS = residue_constants.residue_atoms
@@ -226,7 +226,7 @@ def get_inference_restype_mask(all_chain_features, ccd_preprocessed_dict, extra_
   }
 
 
-def add_assembly_features(all_chain_features, ccd_preprocessed_dict, no_msa_templ_feats=True):
+def add_assembly_features(all_chain_features: Mapping, ccd_preprocessed_dict: Mapping, no_msa_templ_feats:bool=True, covalent_bonds:Optional[List[pipeline_conf_bonds.CovalentBond]]=None):
   '''
     ## NOTE: keep the type and chainID orders.
     all_chain_features: {
@@ -300,7 +300,7 @@ def add_assembly_features(all_chain_features, ccd_preprocessed_dict, no_msa_temp
   ref_features = pipeline_conf_bonds.make_ccd_conf_features(all_chain_info=new_order_chain_infos,
                                                       ccd_preprocessed_dict=ccd_preprocessed_dict,
                                                       extra_feats=extra_feats_infos)
-  bond_features = pipeline_conf_bonds.make_bond_features(covalent_bond=[], 
+  bond_features = pipeline_conf_bonds.make_bond_features(covalent_bond=covalent_bonds, 
                                                       all_chain_info=new_order_chain_infos, 
                                                       ccd_preprocessed_dict=ccd_preprocessed_dict,
                                                       extra_feats=extra_feats_infos)
@@ -369,7 +369,7 @@ def process_chain_msa(args: tuple[pipeline_multimer_parallel.DataPipeline, str,
     return chain_id, raw_features, desc, seq
 
 
-def process_input_json(all_entitys, ccd_preprocessed_path, 
+def process_input_json(all_entitys: List[Entity], ccd_preprocessed_path, 
                           msa_templ_data_pipeline_dict, msa_output_dir,
                           no_msa_templ_feats=False):
 
@@ -378,31 +378,27 @@ def process_input_json(all_entitys, ccd_preprocessed_path,
     all_chain_features = {}
     sequence_features = {} 
     num_chains = 0
-    for entity_items in all_entitys:
+    for entity in all_entitys:
+      if (dtype:=entity.dtype) not in residue_constants.CHAIN_type_order:
+        continue
       # dtype(protein, dna, rna, ligand): no_chains,  msa_seqs, seqs
-      dtype = list(entity_items.keys())[0]
-      items = list(entity_items.values())[0]
-      entity_count = items['count']
-      ccd_seqs = items['seqs']
-      msa_seqs = items['msa_seqs']
-      extra_mol_infos = items.get('extra_mol_infos', {}) ## dict, 「extra-add, ccd_id」: ccd_features.
-
-      for i in range(entity_count):
+
+      for i in range(entity.count):
         chain_num_ids = num_chains + i
         chain_id = digit2alphabet(chain_num_ids) # increase ++
         type_chain_id = dtype + '_' + chain_id
-        if ccd_seqs in sequence_features:
-          all_chain_features[type_chain_id] = copy.deepcopy(sequence_features[ccd_seqs])
+        if entity.seqs in sequence_features:
+          all_chain_features[type_chain_id] = copy.deepcopy(sequence_features[entity.seqs])
           continue
         
-        ccd_list = parsers.parse_ccd_fasta(ccd_seqs)
+        ccd_list = parsers.parse_ccd_fasta(entity.seqs)
         chain_features = {'msa_templ_feats': {},
                           'ccd_seqs': ccd_list, 
-                          'msa_seqs': msa_seqs,
-                          'extra_feats': extra_mol_infos}
+                          'msa_seqs': entity.msa_seqs,
+                          'extra_feats': entity.extra_mol_infos}
         all_chain_features[type_chain_id] = chain_features
-        sequence_features[ccd_seqs] = chain_features
-      num_chains += entity_count
+        sequence_features[entity.seqs] = chain_features
+      num_chains += entity.count
 
     if not no_msa_templ_feats:
       ## 1. get all msa_seqs for protein/rna MSA/Template search. Only for protein/rna.
@@ -456,8 +452,12 @@ def process_input_json(all_entitys, ccd_preprocessed_path,
           for _type_chain_id in fasta_seq_to_type_chain_id[fasta_seq]:
             chain_features['msa_templ_feats'] = copy.deepcopy(seqs_to_msa_features[fasta_seq])
 
+
+    # gather all defined covalent bonds
+    all_covalent_bonds=[bond for entity in all_entitys for bond in entity.msa_seqs if bond.dtype == 'bond']
+
     assert num_chains == len(all_chain_features.keys())
-    all_feats = add_assembly_features(all_chain_features, ccd_preprocessed_dict, no_msa_templ_feats)
+    all_feats = add_assembly_features(all_chain_features, ccd_preprocessed_dict, no_msa_templ_feats, all_covalent_bonds)
     np_example, label = all_feats['feats'], all_feats['label']
     assert num_chains == len(np.unique(np_example['all_chain_ids']))
 
diff --git a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py
index fa30f96a..6aa66b55 100644
--- a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py
+++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py
@@ -17,13 +17,17 @@
 import tempfile
 import itertools
 from absl import logging
-from typing import Tuple, Union, Mapping, Literal, Callable, Any
+from typing import List, Optional, Tuple, Union, Mapping, Literal, Callable, Any
+from dataclasses import dataclass, field
 import rdkit
 from rdkit import Chem
 from rdkit.Chem import AllChem
 from helixfold.common import residue_constants
+from helixfold.data.pipeline_conf_bonds import CovalentBond, parse_covalent_bond_input
 from helixfold.data.tools import utils
 
+from openbabel import openbabel
+
 
 ## NOTE: this mapping is only useful for standard dna/rna/protein sequence input.
 # protein, rna, dna, ligand, non_polymer (ion and non_polymer is also the ligand.)
@@ -163,8 +167,19 @@ def _get_supported_formats(self) -> Tuple[str]:
         formats.append('smiles')
         
         return tuple(formats)
+    
+    def _load_mol(self, mol2_file:str, ret:Optional[subprocess.CompletedProcess]=None) -> Chem.Mol:
+        mol = Chem.MolFromMol2File(mol2_file, sanitize=False)
+        if isinstance(ret, subprocess.CompletedProcess) and '3D coordinate generation failed' in ret.stderr:
+            mol = generate_ETKDGv3_conformer(mol)
+        optimal_mol_wo_H = Chem.RemoveAllHs(mol, sanitize=False)
+
+        return optimal_mol_wo_H
 
     def _perform_conversion(self, input_type: str, input_value: str, generate_3d: bool=True) -> Chem.Mol:
+        if input_type == 'mol2' and input_value.endswith('.mol2'):
+            return self._load_mol(mol2_file=input_value)
+        
         with tempfile.NamedTemporaryFile(suffix=".mol2") as temp_file, utils.timing(f'converting {input_type} to mol2: {input_value}'):
             if input_type == 'smiles':
                 obabel_cmd = f"{self.obabel_bin} -:'{input_value}' -omol2 -O{temp_file.name} {'--gen3d' if generate_3d else ''}"
@@ -174,13 +189,8 @@ def _perform_conversion(self, input_type: str, input_value: str, generate_3d: bo
                 obabel_cmd = f"{self.obabel_bin} -i {input_type} {input_value} -omol2 -O{temp_file.name} {'--gen3d' if generate_3d else ''}"
             logging.debug(f'Launching command: `{obabel_cmd}`')
             ret = subprocess.run(obabel_cmd, shell=True, capture_output=True, text=True)
-            mol = Chem.MolFromMol2File(temp_file.name, sanitize=False)
-            if '3D coordinate generation failed' in ret.stderr:
-                mol = generate_ETKDGv3_conformer(mol)
-            optimal_mol_wo_H = Chem.RemoveAllHs(mol, sanitize=False)
-
-            return optimal_mol_wo_H
-
+            return self._load_mol(mol2_file=temp_file.name, ret=ret)
+            
     def _convert_to_mol(self, input_type: str, input_value: str, generate_3d: bool=True) -> Chem.Mol:
         if input_type not in self.supported_formats:
             raise ValueError(f'Unsupported small molecule input: {input_type}. \nSupported formats: \n{self.supported_formats}\n')
@@ -191,7 +201,16 @@ def _convert_to_mol(self, input_type: str, input_value: str, generate_3d: bool=T
         return self._perform_conversion(input_type, input_value, generate_3d)
 
     __call__: Callable[[str, str, bool], Chem.Mol] = _convert_to_mol
-def polymer_convert(items):
+
+@dataclass
+class Entity:
+    dtype: Literal['protein', 'dna', 'rna', 'ligand', 'bond','non_polymer', 'ion']
+    seqs: str
+    msa_seqs: Union[str, List[CovalentBond]] = ''
+    count: int = 1
+    extra_mol_infos: dict[str, Any] = field(default_factory=dict)
+
+def polymer_convert(items)-> Entity:
     """
         "type": "protein",                          
         "sequence": "GPDSMEEVVVPEEPPKLVSALATYVQQERLCTMFLSIANKLLPLKP",  
@@ -214,18 +233,21 @@ def polymer_convert(items):
             raise ValueError(f'not support for the {dtype} in polymer_convert')
     ccd_seqs = ''.join(ccd_seqs) ## (GLY)(ALA).....
 
-    # repeat_ccds, repeat_fasta = [ccd_seqs], [msa_seqs]
-    return {
-        dtype: {
-            'seqs': ccd_seqs,
-            'msa_seqs': msa_seqs,
-            'count': count,
-            'extra_mol_infos': {}
-        }
-    }
+    return Entity(dtype=dtype, seqs=ccd_seqs, msa_seqs=msa_seqs,count=count)
 
 
-def ligand_convert(items: Mapping[str, Union[int, str]]):
+def covalent_bond_convert(items: Mapping[str, Union[int, str]]) -> Entity:
+    """
+        "type": "bond",
+        "bond": "A,ASN,74,ND2,B,UNK-,"
+    """
+    dtype = items['type']
+    bond = parse_covalent_bond_input(items['bond'])
+
+    return Entity(dtype=dtype, seqs='', msa_seqs=bond)
+
+
+def ligand_convert(items: Mapping[str, Union[int, str]]) -> Entity:
     """
         "type": "ligand",
         "ccd": "ATP", or "smiles": "CCccc(O)ccc",
@@ -261,14 +283,8 @@ def ligand_convert(items: Mapping[str, Union[int, str]]):
     ccd_seqs = ''.join(_ccd_seqs) ## (GLY)(ALA).....
 
     # repeat_ccds, repeat_fasta = [ccd_seqs], [msa_seqs]
-    return {
-        'ligand': {
-            'seqs': ccd_seqs,
-            'msa_seqs': msa_seqs,
-            'count': count,
-            'extra_mol_infos': ccd_to_extra_mol_infos,
-        }
-    }
+    return Entity(dtype='ligand', seqs=ccd_seqs,msa_seqs=msa_seqs,count=count,extra_mol_infos=ccd_to_extra_mol_infos)
+
 
 
 def entities_rename_and_filter(items):
@@ -276,39 +292,34 @@ def entities_rename_and_filter(items):
         'ion': 'ligand'
     }
     items['type'] = ligand_mapping.get(items['type'], items['type'])
-    if items['type'] not in ALLOWED_ENTITY_TYPE:
+    if items['type'] not in ALLOWED_ENTITY_TYPE and items['type'] != 'bond':
         raise ValueError(f'{items["type"]} is not allowed, will be ignored.')
     return items
 
 
-def modify_name_convert(entities: list):
+def modify_name_convert(entities: list[Entity]):
     cur_idx = 0
-    for entity_items in entities:
+    for entity in entities:
         # dtype(protein, dna, rna, ligand): no_chains,  msa_seqs, seqs
-        dtype = list(entity_items.keys())[0]
-        items = list(entity_items.values())[0]
-        entity_count = items['count']
-        msa_seqs = items['msa_seqs']
-        extra_mol_infos = items.get('extra_mol_infos', {}) ## dict, 「extra-add, ccd_id」: ccd_features.
 
-        extra_ccd_ids = list(extra_mol_infos.keys())
         ## rename UNK- to UNK-1, 2, 3, 4...
-        for k in extra_ccd_ids:
+        extra_mol_infos=copy.deepcopy(entity.extra_mol_infos)
+        for k in extra_mol_infos:
             user_name_3 = USER_LIG_IDS_3[cur_idx]
-            items['seqs'] = items['seqs'].replace('UNK-', user_name_3)
-            extra_mol_infos[user_name_3] = extra_mol_infos.pop('UNK-')
+            entity.seqs = entity.seqs.replace('UNK-', user_name_3)
+            entity.extra_mol_infos[user_name_3] = entity.extra_mol_infos.pop('UNK-')
             cur_idx += 1
 
     return entities
 
 
-def online_json_to_entity(json_path, out_dir):
+def online_json_to_entity(json_path: str, out_dir: str)-> list[Entity]:
     obj = read_json(json_path)
     entities = copy.deepcopy(obj['entities'])
 
     os.makedirs(out_dir, exist_ok=True)
     error_ids = []
-    success_entity = []
+    success_entity: list[Entity] = []
     for idx, items in enumerate(entities):
         try: 
             items = entities_rename_and_filter(items)
@@ -320,6 +331,8 @@ def online_json_to_entity(json_path, out_dir):
         try:
             if items['type'] == 'ligand':
                 json_obj = ligand_convert(items)
+            elif items['type'] == 'bond':
+                json_obj = covalent_bond_convert(items)
             else:
                 json_obj = polymer_convert(items)
             success_entity.append(json_obj)
@@ -334,5 +347,4 @@ def online_json_to_entity(json_path, out_dir):
     if len(error_ids) > 0:
         raise RuntimeError(f'[Error] Failed to convert {len(error_ids)}/{len(entities)} entities')    
     
-    success_entity = modify_name_convert(success_entity)
-    return success_entity
\ No newline at end of file
+    return modify_name_convert(success_entity)
\ No newline at end of file

From 02a88396273c3ed8de42a9fc8491ad800502c39c Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Fri, 23 Aug 2024 16:54:33 +0800
Subject: [PATCH 35/42] fix: covalent bond

---
 .../helixfold3/data/demo_7s69_coval.json      |  2 +-
 .../helixfold/data/pipeline_conf_bonds.py     | 42 ++++++++++++++++---
 .../infer_scripts/feature_processing_aa.py    |  2 +-
 .../helixfold/infer_scripts/preprocess.py     | 12 ++++--
 4 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/apps/protein_folding/helixfold3/data/demo_7s69_coval.json b/apps/protein_folding/helixfold3/data/demo_7s69_coval.json
index fc53c9e9..a3890224 100644
--- a/apps/protein_folding/helixfold3/data/demo_7s69_coval.json
+++ b/apps/protein_folding/helixfold3/data/demo_7s69_coval.json
@@ -12,7 +12,7 @@
         },
         {
             "type": "bond",
-            "bond": "A,ASN,74,ND2,B,UNK1,1,C,covale,2.3",
+            "bond": "A,ASN,74,ND2,B,UNK-1,1,C16,covale,2.3",
             "_comment": "'A,74,ND2:B,1:CW,null' from RF2AA"
         }
     ]
diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
index 7a9c2960..9e5b7d2e 100644
--- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
+++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
@@ -115,6 +115,7 @@ def parse_covalent_bond_input(input_string: str) -> List[CovalentBond]:
 
         # Append the CovalentBond instance to the list
         covalent_bonds.append(covalent_bond)
+        logging.info(f"Added {len(covalent_bonds)} bonds: {covalent_bonds}")
 
     return covalent_bonds
 
@@ -306,6 +307,8 @@ def make_bond_features(covalent_bond: List[CovalentBond], all_chain_info, ccd_pr
   chainId_to_type = {}
   ligand_bond_type = [] # (i, j, bond_type), represent the bond between token i and token j
   bond_index = [] # (i,j) represent the bond between token i and token j
+
+  ccd_id2atom_ids: dict[str, list]={}
   ccd_standard_set = residue_constants.STANDARD_LIST
   for chain_type_id, ccd_seq in all_chain_info.items():
       chain_type, chain_id = chain_type_id.rsplit('_', 1)
@@ -326,6 +329,8 @@ def make_bond_features(covalent_bond: List[CovalentBond], all_chain_info, ccd_pr
             else:
                 _ccd_feats = ccd_preprocessed_dict[ccd_id]
             atom_ids = _ccd_feats['atom_ids']
+
+            ccd_id2atom_ids[ccd_id] = atom_ids
             assert len(atom_ids) > 0, f'TODO filter - Got CCD <{ccd_id}>: 0 atom nums.'
             
             all_token_nums += len(atom_ids)
@@ -372,16 +377,43 @@ def make_bond_features(covalent_bond: List[CovalentBond], all_chain_info, ccd_pr
           ptnr2_label_seq_id = ptnr2_auth_seq_id
 
       try:
-        assert ptnr1_label_asym_id in chainId_to_ccd_list and ptnr2_label_asym_id in chainId_to_ccd_list
+        if not (ptnr1_label_asym_id in chainId_to_ccd_list and ptnr2_label_asym_id in chainId_to_ccd_list):
+           raise ValueError(f"Invalid chain id:\n{ptnr1_label_asym_id}/{ptnr2_label_asym_id}\n{chainId_to_ccd_list}")
         ptnr1_ccd_id = chainId_to_ccd_list[ptnr1_label_asym_id][int(ptnr1_label_seq_id) - 1]
         ptnr2_ccd_id = chainId_to_ccd_list[ptnr2_label_asym_id][int(ptnr2_label_seq_id) - 1]
-        assert ptnr1_ccd_id == ptnr1_label_comp_id and ptnr2_ccd_id == ptnr2_label_comp_id
-      except:
+
+
+        # renamed ligand residues
+
+
+        if ptnr1_ccd_id != ptnr1_label_comp_id:
+           logging.warning(f"Find ligand residue: {ptnr1_label_comp_id} -> {ptnr1_ccd_id}")
+           #ptnr1_label_comp_id = ptnr1_ccd_id
+
+        if ptnr2_ccd_id != ptnr2_label_comp_id:
+           logging.warning(f"Find ligand residue: {ptnr2_label_comp_id} -> {ptnr2_ccd_id}")
+           #ptnr2_label_comp_id = ptnr2_ccd_id
+
+      except ValueError as e:
         ## some convalent-bond from mmcif is misslead, pass it.
+        logging.warning(f'Error occurred during covalent bond processing: {e}')
         continue
+
+
       
-      ptnr1_ccd_atoms_list = ccd_preprocessed_dict[ptnr1_ccd_id]['atom_ids']
-      ptnr2_ccd_atoms_list = ccd_preprocessed_dict[ptnr2_ccd_id]['atom_ids']
+      if ptnr1_ccd_id in ccd_preprocessed_dict:
+        ptnr1_ccd_atoms_list = ccd_preprocessed_dict[ptnr1_ccd_id]['atom_ids']
+      else:
+        ptnr1_ccd_atoms_list = ccd_id2atom_ids[ptnr1_ccd_id]
+        
+      logging.debug(f'{ptnr1_ccd_id=}: {ptnr1_ccd_atoms_list=}')
+      
+      if ptnr2_ccd_id in ccd_preprocessed_dict:
+        ptnr2_ccd_atoms_list = ccd_preprocessed_dict[ptnr2_ccd_id]['atom_ids']
+      else:
+        ptnr2_ccd_atoms_list  = ccd_id2atom_ids[ptnr2_ccd_id]
+
+      logging.debug(f'{ptnr2_ccd_id=}: {ptnr2_ccd_atoms_list=}')
 
       if ptnr1_ccd_id in ccd_standard_set:  
           ## if ccd_id is in the standard residue in HF3 (table 13), we didn't have to map to atom-leval index
diff --git a/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py
index 37b09a99..31272c32 100644
--- a/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py
+++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py
@@ -454,7 +454,7 @@ def process_input_json(all_entitys: List[Entity], ccd_preprocessed_path,
 
 
     # gather all defined covalent bonds
-    all_covalent_bonds=[bond for entity in all_entitys for bond in entity.msa_seqs if bond.dtype == 'bond']
+    all_covalent_bonds=[bond for entity in all_entitys for bond in entity.msa_seqs if entity.dtype == 'bond']
 
     assert num_chains == len(all_chain_features.keys())
     all_feats = add_assembly_features(all_chain_features, ccd_preprocessed_dict, no_msa_templ_feats, all_covalent_bonds)
diff --git a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py
index 6aa66b55..2fffbb96 100644
--- a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py
+++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py
@@ -180,16 +180,20 @@ def _perform_conversion(self, input_type: str, input_value: str, generate_3d: bo
         if input_type == 'mol2' and input_value.endswith('.mol2'):
             return self._load_mol(mol2_file=input_value)
         
-        with tempfile.NamedTemporaryFile(suffix=".mol2") as temp_file, utils.timing(f'converting {input_type} to mol2: {input_value}'):
+        save_path=os.path.join('ligands',f'{os.path.basename(input_value)[:-(len(input_type)+1)] if input_type != "smiles" else "UNK"}.mol2')
+
+        os.makedirs(os.path.dirname(save_path), exist_ok=True)
+        
+        with utils.timing(f'converting {input_type} to mol2: {input_value}'):
             if input_type == 'smiles':
-                obabel_cmd = f"{self.obabel_bin} -:'{input_value}' -omol2 -O{temp_file.name} {'--gen3d' if generate_3d else ''}"
+                obabel_cmd = f"{self.obabel_bin} -:'{input_value}' -omol2 -O{save_path} {'--gen3d' if generate_3d else ''}"
                 if len(input_value)>60:
                     logging.warning(f'This takes a while ...')
             else:
-                obabel_cmd = f"{self.obabel_bin} -i {input_type} {input_value} -omol2 -O{temp_file.name} {'--gen3d' if generate_3d else ''}"
+                obabel_cmd = f"{self.obabel_bin} -i {input_type} {input_value} -omol2 -O{save_path} {'--gen3d' if generate_3d else ''}"
             logging.debug(f'Launching command: `{obabel_cmd}`')
             ret = subprocess.run(obabel_cmd, shell=True, capture_output=True, text=True)
-            return self._load_mol(mol2_file=temp_file.name, ret=ret)
+            return self._load_mol(mol2_file=save_path, ret=ret)
             
     def _convert_to_mol(self, input_type: str, input_value: str, generate_3d: bool=True) -> Chem.Mol:
         if input_type not in self.supported_formats:

From 2ed4486fd5c9041253cb4be9dd3db11b3deff85c Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Fri, 23 Aug 2024 22:54:03 +0800
Subject: [PATCH 36/42] doc: covalently

---
 apps/protein_folding/helixfold3/README.md     | 43 +++++++++++++++++++
 .../helixfold3/data/demo_7s69_coval.json      |  3 +-
 .../helixfold3/data/demo_p450_heme_coval.json | 25 +++++++++++
 .../helixfold3/helixfold/inference.py         | 20 +++++++++
 .../protein_folding/helixfold3/pyproject.toml |  1 +
 5 files changed, 91 insertions(+), 1 deletion(-)
 create mode 100644 apps/protein_folding/helixfold3/data/demo_p450_heme_coval.json

diff --git a/apps/protein_folding/helixfold3/README.md b/apps/protein_folding/helixfold3/README.md
index 639a46be..0eaa1512 100644
--- a/apps/protein_folding/helixfold3/README.md
+++ b/apps/protein_folding/helixfold3/README.md
@@ -126,6 +126,49 @@ A example of input data is as follows:
 }
 ```
 
+Another example of **covalently modified** input:
+```json
+{
+    "entities": [
+        {
+            "type": "protein",
+            "sequence": "MDALYKSTVAKFNEVIQLDCSTEFFSIALSSIAGILLLLLLFRSKRHSSLKLPPGKLGIPFIGESFIFLRALRSNSLEQFFDERVKKFGLVFKTSLIGHPTVVLCGPAGNRLILSNEEKLVQMSWPAQFMKLMGENSVATRRGEDHIVMRSALAGFFGPGALQSYIGKMNTEIQSHINEKWKGKDEVNVLPLVRELVFNISAILFFNIYDKQEQDRLHKLLETILVGSFALPIDLPGFGFHRALQGRAKLNKIMLSLIKKRKEDLQSGSATATQDLLSVLLTFRDDKGTPLTNDEILDNFSSLLHASYDTTTSPMALIFKLLSSNPECYQKVVQEQLEILSNKEEGEEITWKDLKAMKYTWQVAQETLRMFPPVFGTFRKAITDIQYDGYTIPKGWKLLWTTYSTHPKDLYFNEPEKFMPSRFDQEGKHVAPYTFLPFGGGQRSCVGWEFSKMEILLFVHHFVKTFSSYTPVDPDEKISGDPLPPLPSKGFSIKLFPRP",
+            "count": 1
+        },
+        {
+            "type": "ligand",
+            "ccd": "HEM",
+            "count": 1
+        },
+        {
+            "type": "ligand",
+            "smiles": "CC1=C2CC[C@@]3(CCCC(=C)[C@H]3C[C@@H](C2(C)C)CC1)C",
+            "count": 1
+        },
+        {
+            "type": "bond",
+            "bond": "A,CYS,445,SG,B,HEM,1,FE,covale,2.3",
+            "_comment": "<chain-id>,<residue name>,<residue index>,<atom id>,<chain-id>,<residue name>,<residue index>,<atom id>,<bond type>,<bond length>",
+            "_also_comment": "For ccd input, use CCD key as residue name; for smiles and file input, use `UNK-<index>` where index is the chain order you input"
+        }
+    ]
+}
+```
+
+For seaking all atom ids in CCD database:
+```shell
+helixfold_show_ccd +ccd_id=HEM
+```
+
+This command outputs like:
+```text
+# output:
+[2024-08-23 22:44:36,324][absl][INFO] - Started Loading CCD dataset from /mnt/db/ccd/ccd_preprocessed_etkdg.pkl.gz
+[2024-08-23 22:44:43,236][absl][INFO] - Finished Loading CCD dataset from /mnt/db/ccd/ccd_preprocessed_etkdg.pkl.gz in 6.912 seconds
+[2024-08-23 22:44:43,237][absl][INFO] - CCD dataset contains 43488 entries.
+[2024-08-23 22:44:43,237][absl][INFO] - Atoms in HEM: ['CHA', 'CHB', 'CHC', 'CHD', 'C1A', 'C2A', 'C3A', 'C4A', 'CMA', 'CAA', 'CBA', 'CGA', 'O1A', 'O2A', 'C1B', 'C2B', 'C3B', 'C4B', 'CMB', 'CAB', 'CBB', 'C1C', 'C2C', 'C3C', 'C4C', 'CMC', 'CAC', 'CBC', 'C1D', 'C2D', 'C3D', 'C4D', 'CMD', 'CAD', 'CBD', 'CGD', 'O1D', 'O2D', 'NA', 'NB', 'NC', 'ND', 'FE']
+```
+
 #### Running HelixFold for Inference
 To run inference on a sequence or multiple sequences using HelixFold3's pretrained parameters, run e.g.:
 
diff --git a/apps/protein_folding/helixfold3/data/demo_7s69_coval.json b/apps/protein_folding/helixfold3/data/demo_7s69_coval.json
index a3890224..7b2550ff 100644
--- a/apps/protein_folding/helixfold3/data/demo_7s69_coval.json
+++ b/apps/protein_folding/helixfold3/data/demo_7s69_coval.json
@@ -13,7 +13,8 @@
         {
             "type": "bond",
             "bond": "A,ASN,74,ND2,B,UNK-1,1,C16,covale,2.3",
-            "_comment": "'A,74,ND2:B,1:CW,null' from RF2AA"
+            "_comment": "'A,74,ND2:B,1:CW,null' from RF2AA.",
+            "_also_comment": "For ccd input, use CCD key as residue name; for smiles and file input, use `UNK-<index>` where index is the chain order you input"
         }
     ]
 }
\ No newline at end of file
diff --git a/apps/protein_folding/helixfold3/data/demo_p450_heme_coval.json b/apps/protein_folding/helixfold3/data/demo_p450_heme_coval.json
new file mode 100644
index 00000000..6761a4bc
--- /dev/null
+++ b/apps/protein_folding/helixfold3/data/demo_p450_heme_coval.json
@@ -0,0 +1,25 @@
+{
+    "entities": [
+        {
+            "type": "protein",
+            "sequence": "MDALYKSTVAKFNEVIQLDCSTEFFSIALSSIAGILLLLLLFRSKRHSSLKLPPGKLGIPFIGESFIFLRALRSNSLEQFFDERVKKFGLVFKTSLIGHPTVVLCGPAGNRLILSNEEKLVQMSWPAQFMKLMGENSVATRRGEDHIVMRSALAGFFGPGALQSYIGKMNTEIQSHINEKWKGKDEVNVLPLVRELVFNISAILFFNIYDKQEQDRLHKLLETILVGSFALPIDLPGFGFHRALQGRAKLNKIMLSLIKKRKEDLQSGSATATQDLLSVLLTFRDDKGTPLTNDEILDNFSSLLHASYDTTTSPMALIFKLLSSNPECYQKVVQEQLEILSNKEEGEEITWKDLKAMKYTWQVAQETLRMFPPVFGTFRKAITDIQYDGYTIPKGWKLLWTTYSTHPKDLYFNEPEKFMPSRFDQEGKHVAPYTFLPFGGGQRSCVGWEFSKMEILLFVHHFVKTFSSYTPVDPDEKISGDPLPPLPSKGFSIKLFPRP",
+            "count": 1
+        },
+        {
+            "type": "ligand",
+            "ccd": "HEM",
+            "count": 1
+        },
+        {
+            "type": "ligand",
+            "smiles": "CC1=C2CC[C@@]3(CCCC(=C)[C@H]3C[C@@H](C2(C)C)CC1)C",
+            "count": 1
+        },
+        {
+            "type": "bond",
+            "bond": "A,CYS,445,SG,B,HEM,1,FE,covale,2.3",
+            "_comment": "<chain-id>,<residue name>,<residue index>,<atom id>,<chain-id>,<residue name>,<residue index>,<atom id>,<bond type>,<bond length>",
+            "_also_comment": "For ccd input, use CCD key as residue name; for smiles and file input, use `UNK-<index>` where index is the chain order you input"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py
index e18029a5..8a862279 100644
--- a/apps/protein_folding/helixfold3/helixfold/inference.py
+++ b/apps/protein_folding/helixfold3/helixfold/inference.py
@@ -31,6 +31,7 @@
 import hydra
 
 from helixfold.common import all_atom_pdb_save
+from helixfold.data.pipeline_conf_bonds import load_ccd_dict
 from helixfold.model import config, utils
 from helixfold.data import pipeline_parallel as pipeline
 from helixfold.data import pipeline_multimer_parallel as pipeline_multimer
@@ -565,5 +566,24 @@ def main(cfg: DictConfig):
     ranking_all_predictions(all_pred_path)
     print(f'============ Inference finished ! ============')
 
+
+@hydra.main(version_base=None, config_path=os.path.join(script_path,'config',),config_name='helixfold')
+def show_atom_id_ccd(cfg: DictConfig):
+    ccd_preprocessed_path = cfg.db.ccd_preprocessed
+    
+
+    ccd_id=cfg.ccd_id
+    if len(ccd_id) <= 3 and ccd_id in (ccd_dict:=load_ccd_dict(ccd_preprocessed_path)):
+        logging.info(f'Atoms in {ccd_id}: {ccd_dict[ccd_id]["atom_ids"]}')
+        return
+
+
+
+    
+    
+
+
+
+
 if __name__ == '__main__':
     main()
diff --git a/apps/protein_folding/helixfold3/pyproject.toml b/apps/protein_folding/helixfold3/pyproject.toml
index 8c6dc9f7..33956227 100644
--- a/apps/protein_folding/helixfold3/pyproject.toml
+++ b/apps/protein_folding/helixfold3/pyproject.toml
@@ -47,3 +47,4 @@ joblib = "1.4.2"
 
 [tool.poetry.scripts]
 helixfold = 'helixfold.inference:main'
+helixfold_show_ccd = 'helixfold.inference:show_atom_id_ccd'

From b718ced5d4f3137fae795647b9f349853fda400a Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sat, 24 Aug 2024 11:18:43 +0800
Subject: [PATCH 37/42] Update README.md

---
 apps/protein_folding/helixfold3/README.md | 122 ++++++++++++++++++++--
 1 file changed, 112 insertions(+), 10 deletions(-)

diff --git a/apps/protein_folding/helixfold3/README.md b/apps/protein_folding/helixfold3/README.md
index 0eaa1512..8c810ccd 100644
--- a/apps/protein_folding/helixfold3/README.md
+++ b/apps/protein_folding/helixfold3/README.md
@@ -72,14 +72,22 @@ Note: If you have a different version of python3 and cuda, please refer to [here
 #### Install Maxit
 The conversion between `.cif` and `.pdb` relies on [Maxit](https://sw-tools.rcsb.org/apps/MAXIT/index.html). 
 Download Maxit source code from https://sw-tools.rcsb.org/apps/MAXIT/maxit-v11.100-prod-src.tar.gz. Untar and follow 
-its `README` to complete installation. 
+its `README` to complete installation. If you encouter error like your GCC version not support (9.4.0, for example), editing `etc/platform.sh` and reruning compilation again would make sense. See below:
+
+```bash
+#   Check if it is a Linux platform
+    Linux)
+#     Check if it is GCC version 4.x
+      gcc_ver=`gcc --version | grep -e " 4\."` # edit `4\.` to `9\.`
+      if [[ -z $gcc_ver ]]
+```
 
 ### Usage
 
 In order to run HelixFold3, the genetic databases and model parameters are required.
 
 The parameters of HelixFold3 can be downloaded [here](https://paddlehelix.bd.bcebos.com/HelixFold3/params/HelixFold3-params-240814.zip), 
-please place the downloaded checkpoint in ```./init_models/ ```directory.
+please place the downloaded checkpoint path in `weight_path` of `helixfold/config/helixfold.yaml` configuration file before install HF3 as a python module.
 
 The script `scripts/download_all_data.sh` can be used to download and set up all genetic databases with the following configs:
 
@@ -109,6 +117,7 @@ HelixFold3 supports input ligand as SMILES, CCD id or small molecule files, plea
 for more details about SMILES input. Flexible input from small molecule is now supported. See `obabel -L formats |grep -v 'Write-only'`
 
 A example of input data is as follows:
+
 ```json
 {
     "entities": [
@@ -127,6 +136,7 @@ A example of input data is as follows:
 ```
 
 Another example of **covalently modified** input:
+
 ```json
 {
     "entities": [
@@ -149,18 +159,21 @@ Another example of **covalently modified** input:
             "type": "bond",
             "bond": "A,CYS,445,SG,B,HEM,1,FE,covale,2.3",
             "_comment": "<chain-id>,<residue name>,<residue index>,<atom id>,<chain-id>,<residue name>,<residue index>,<atom id>,<bond type>,<bond length>",
-            "_also_comment": "For ccd input, use CCD key as residue name; for smiles and file input, use `UNK-<index>` where index is the chain order you input"
+            "_another_comment": "use semicolon to separate multiple bonds",
+            "_also_comment": "For ccd input, use CCD key as residue name; for smiles and file input, use `UNK-<index>` where index is the chain order you input. eg. `UNK-1` for the first ligand chain(or the count #1), `UNK-2` the second(or the count #2)."
         }
     ]
 }
 ```
 
 For seaking all atom ids in CCD database:
+
 ```shell
 helixfold_show_ccd +ccd_id=HEM
 ```
 
 This command outputs like:
+
 ```text
 # output:
 [2024-08-23 22:44:36,324][absl][INFO] - Started Loading CCD dataset from /mnt/db/ccd/ccd_preprocessed_etkdg.pkl.gz
@@ -169,10 +182,97 @@ This command outputs like:
 [2024-08-23 22:44:43,237][absl][INFO] - Atoms in HEM: ['CHA', 'CHB', 'CHC', 'CHD', 'C1A', 'C2A', 'C3A', 'C4A', 'CMA', 'CAA', 'CBA', 'CGA', 'O1A', 'O2A', 'C1B', 'C2B', 'C3B', 'C4B', 'CMB', 'CAB', 'CBB', 'C1C', 'C2C', 'C3C', 'C4C', 'CMC', 'CAC', 'CBC', 'C1D', 'C2D', 'C3D', 'C4D', 'CMD', 'CAD', 'CBD', 'CGD', 'O1D', 'O2D', 'NA', 'NB', 'NC', 'ND', 'FE']
 ```
 
+For seaking all atom ids in a given `sdf`/`mol2`, the atom list follows the same order in its file.
+
+HF3 parsed:
+
+```text
+['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'N1', 'O1', 'O2', 'O3', 'O4', 'O5', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'N2', 'O6', 'O7', 'O8', 'O9', 'O10', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22', 'O11', 'O12', 'O13', 'O14', 'O15', 'C23', 'C24', 'C25', 'C26', 'C27', 'C28', 'O16', 'O17', 'O18', 'O19', 'O20', 'C29', 'C30', 'C31', 'C32', 'C33', 'C34', 'O21', 'O22', 'O23', 'O24', 'O25', 'C35', 'C36', 'C37', 'C38', 'C39', 'C40', 'O26', 'O27', 'O28', 'O29', 'O30']
+```
+
+while in `SDF`:
+
+```text
+   29.7340    3.2540   76.7430 C   0  0  0  0  0  2  0  0  0  0  0  0
+   29.8160    4.4760   77.6460 C   0  0  1  0  0  3  0  0  0  0  0  0
+   28.5260    5.2840   77.5530 C   0  0  2  0  0  3  0  0  0  0  0  0
+   28.1780    5.5830   76.1020 C   0  0  1  0  0  3  0  0  0  0  0  0
+   28.2350    4.3240   75.2420 C   0  0  1  0  0  3  0  0  0  0  0  0
+   28.1040    4.6170   73.7650 C   0  0  0  0  0  2  0  0  0  0  0  0
+   31.3020    3.8250   79.4830 C   0  0  0  0  0  0  0  0  0  0  0  0
+   31.3910    3.4410   80.9280 C   0  0  0  0  0  1  0  0  0  0  0  0
+   30.0760    4.0880   79.0210 N   0  0  0  0  0  2  0  0  0  0  0  0
+   28.6870    6.5050   78.2670 O   0  0  0  0  0  1  0  0  0  0  0  0
+   26.8490    6.0910   76.0350 O   0  0  0  0  0  0  0  0  0  0  0  0
+   29.4950    3.6650   75.4130 O   0  0  0  0  0  0  0  0  0  0  0  0
+   29.3670    4.5550   73.1150 O   0  0  0  0  0  1  0  0  0  0  0  0
+   32.2950    3.8940   78.7640 O   0  0  0  0  0  0  0  0  0  0  0  0
+   26.7420    7.4140   75.6950 C   0  0  1  0  0  3  0  0  0  0  0  0
+   25.2700    7.7830   75.6110 C   0  0  1  0  0  3  0  0  0  0  0  0
+   25.1290    9.2300   75.1610 C   0  0  2  0  0  3  0  0  0  0  0  0
+   25.9180   10.1440   76.0880 C   0  0  1  0  0  3  0  0  0  0  0  0
+   27.3630    9.6720   76.2210 C   0  0  1  0  0  3  0  0  0  0  0  0
+   28.1310   10.4360   77.2730 C   0  0  0  0  0  2  0  0  0  0  0  0
+   23.8820    5.8170   75.1400 C   0  0  0  0  0  0  0  0  0  0  0  0
+   23.1980    5.0100   74.0810 C   0  0  0  0  0  1  0  0  0  0  0  0
+   24.5530    6.8930   74.7160 N   0  0  0  0  0  2  0  0  0  0  0  0
+   23.7530    9.5950   75.1670 O   0  0  0  0  0  1  0  0  0  0  0  0
+   25.9170   11.4700   75.5730 O   0  0  0  0  0  0  0  0  0  0  0  0
+   27.4050    8.2900   76.6040 O   0  0  0  0  0  0  0  0  0  0  0  0
+   29.5300   10.4030   77.0280 O   0  0  0  0  0  1  0  0  0  0  0  0
+   23.8300    5.5110   76.3290 O   0  0  0  0  0  0  0  0  0  0  0  0
+   25.3940   12.4250   76.4090 C   0  0  1  0  0  3  0  0  0  0  0  0
+   25.9490   13.7680   75.9090 C   0  0  2  0  0  3  0  0  0  0  0  0
+   25.1320   14.9560   76.4900 C   0  0  2  0  0  3  0  0  0  0  0  0
+   23.6130   14.6900   76.6390 C   0  0  1  0  0  3  0  0  0  0  0  0
+   23.3700   13.3000   77.2280 C   0  0  1  0  0  3  0  0  0  0  0  0
+   21.9020   12.9360   77.3500 C   0  0  0  0  0  2  0  0  0  0  0  0
+   25.9010   13.8490   74.4810 O   0  0  0  0  0  1  0  0  0  0  0  0
+   25.3420   16.1410   75.7110 O   0  0  0  0  0  0  0  0  0  0  0  0
+   23.0420   15.6520   77.5170 O   0  0  0  0  0  1  0  0  0  0  0  0
+   23.9910   12.3690   76.3570 O   0  0  0  0  0  0  0  0  0  0  0  0
+   21.3660   12.8480   76.0500 O   0  0  0  0  0  0  0  0  0  0  0  0
+   20.8090   11.6500   75.6780 C   0  0  2  0  0  3  0  0  0  0  0  0
+   20.6800   11.6410   74.1740 C   0  0  2  0  0  3  0  0  0  0  0  0
+   19.5510   12.5850   73.8180 C   0  0  2  0  0  3  0  0  0  0  0  0
+   18.2370   12.0940   74.4540 C   0  0  1  0  0  3  0  0  0  0  0  0
+   18.4030   11.9240   75.9810 C   0  0  1  0  0  3  0  0  0  0  0  0
+   17.2710   11.1260   76.6120 C   0  0  0  0  0  2  0  0  0  0  0  0
+   20.2900   10.3510   73.7080 O   0  0  0  0  0  1  0  0  0  0  0  0
+   19.4280   12.7380   72.4110 O   0  0  0  0  0  0  0  0  0  0  0  0
+   17.2120   13.0460   74.2030 O   0  0  0  0  0  1  0  0  0  0  0  0
+   19.6260   11.2000   76.3010 O   0  0  0  0  0  0  0  0  0  0  0  0
+   16.0670   11.4490   75.9360 O   0  0  0  0  0  1  0  0  0  0  0  0
+   20.2190   13.6280   71.7260 C   0  0  2  0  0  3  0  0  0  0  0  0
+   19.6090   14.0000   70.3810 C   0  0  2  0  0  3  0  0  0  0  0  0
+   19.6360   12.7820   69.4880 C   0  0  2  0  0  3  0  0  0  0  0  0
+   21.0860   12.3100   69.3240 C   0  0  1  0  0  3  0  0  0  0  0  0
+   21.7030   12.0240   70.7120 C   0  0  1  0  0  3  0  0  0  0  0  0
+   23.1940   11.7460   70.6620 C   0  0  0  0  0  2  0  0  0  0  0  0
+   20.4080   14.9810   69.7000 O   0  0  0  0  0  1  0  0  0  0  0  0
+   19.0310   13.0500   68.2340 O   0  0  0  0  0  1  0  0  0  0  0  0
+   21.1060   11.1280   68.5380 O   0  0  0  0  0  1  0  0  0  0  0  0
+   21.5380   13.1700   71.5840 O   0  0  0  0  0  0  0  0  0  0  0  0
+   23.8240   12.5210   71.6820 O   0  0  0  0  0  1  0  0  0  0  0  0
+   26.0070   17.3020   76.0200 C   0  0  2  0  0  3  0  0  0  0  0  0
+   27.0750   17.5250   74.9350 C   0  0  2  0  0  3  0  0  0  0  0  0
+   28.3660   16.8320   75.3290 C   0  0  2  0  0  3  0  0  0  0  0  0
+   28.7820   17.2470   76.7510 C   0  0  1  0  0  3  0  0  0  0  0  0
+   27.6930   16.8120   77.7320 C   0  0  1  0  0  3  0  0  0  0  0  0
+   27.9770   17.2020   79.1710 C   0  0  0  0  0  2  0  0  0  0  0  0
+   27.3990   18.9140   74.8010 O   0  0  0  0  0  1  0  0  0  0  0  0
+   29.4060   17.0990   74.3950 O   0  0  0  0  0  1  0  0  0  0  0  0
+   30.0160   16.6410   77.0930 O   0  0  0  0  0  1  0  0  0  0  0  0
+   26.4610   17.4820   77.3520 O   0  0  0  0  0  0  0  0  0  0  0  0
+   27.3660   18.4620   79.4040 O   0  0  0  0  0  1  0  0  0  0  0  0
+```
+
 #### Running HelixFold for Inference
+
 To run inference on a sequence or multiple sequences using HelixFold3's pretrained parameters, run e.g.:
 
 ##### Run from default config
+
 ```shell
 LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH \
 helixfold \
@@ -182,6 +282,7 @@ helixfold \
 ```
 
 ##### Run with customized configuration dir and file(`./myfold.yaml`, for example):
+
 ```shell
 LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH \
 helixfold --config-dir=. --config-name=myfold \
@@ -191,14 +292,15 @@ helixfold --config-dir=. --config-name=myfold \
 ```
 
 ##### Run with additional configuration term 
+
 ```shell
 LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH \
 helixfold \
     input=./data/demo_6zcy.json \
     output=. \
-    CONFIG_DIFFS.model.heads.confidence_head.weight=0.01 \
-    CONFIG_DIFFS.model.global_config.subbatch_size=192 \
-    CONFIG_DIFFS.model.num_recycle=10
+    CONFIG_DIFFS.preset=allatom_demo \
+    +CONFIG_DIFFS.model.global_config.subbatch_size=192 \
+    +CONFIG_DIFFS.model.num_recycle=10
 ```
 
 The descriptions of the above script are as follows:
@@ -216,7 +318,7 @@ The outputs will be in a subfolder of `output_dir`, including the computed MSAs,
 ranked structures, and evaluation metrics. For a task of inferring twice with diffusion batch size 3, 
 assume your input JSON is named `demo_data.json`, the `output_dir` directory will have the following structure:
 
-```
+```text
 <output_dir>/
 └── demo_data/
     ├── demo_data-pred-1-1/
@@ -240,9 +342,10 @@ assume your input JSON is named `demo_data.json`, the `output_dir` directory wil
         └── ...
 
 ```
+
 The contents of each output file are as follows:
 * `final_features.pkl` – A `pickle` file containing the input feature NumPy arrays
- used by the models to predict the structures.
+ used by the models to predict the structures. If you need to re-run a inference without re-building the MSAs, delete this file.
 * `msas/` - A directory containing the files describing the various genetic
  tool hits that were used to construct the input MSA.
 * `demo_data-pred-X-Y` - Prediction results of `demo_data.json` in X-th inference and Y-thdiffusion batch, 
@@ -256,8 +359,7 @@ We suggest a single GPU for inference has at least 32G available memory. The max
 single V100-32G with precision `fp32` is up to 1000. Inferring longer tokens or entities with larger atom numbers 
 per token than normal protein residues like nucleic acids may cost more GPU memory.
 
-For samples with larger tokens, you can reduce `model.global_config.subbatch_size` in `CONFIG_DIFFS` in `helixfold/model/config.py` to save more GPU memory but suffer from slower inference. `model.global_config.subbatch_size` is set as `96` by default. You can also
-reduce the number of additional recycles by changing `model.num_recycle` in the same place.
+For samples with larger tokens, you can override `model.global_config.subbatch_size` in `CONFIG_ALLATOM` by using `+CONFIG_DIFFS.model.global_config.subbatch_size=X` on command runs, where `X` is a smaller number than `96`, to save more GPU memory although this will cause a slower inference. Additionally, you can reduce the number of additional recycles by setting `+CONFIG_DIFFS.model.num_recycle=Y`, where `Y` is a smaller number than `3`.
 
 
 We are keen on support longer token inference, it will come in soon.

From b010c13a490f6b21677c28d4830edf2ad4417a38 Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sat, 24 Aug 2024 13:00:54 +0800
Subject: [PATCH 38/42] feat: disulfide bonding

---
 .../helixfold3/data/demo_disulf.json               | 14 ++++++++++++++
 .../helixfold/data/pipeline_conf_bonds.py          |  2 +-
 2 files changed, 15 insertions(+), 1 deletion(-)
 create mode 100644 apps/protein_folding/helixfold3/data/demo_disulf.json

diff --git a/apps/protein_folding/helixfold3/data/demo_disulf.json b/apps/protein_folding/helixfold3/data/demo_disulf.json
new file mode 100644
index 00000000..780b9d7d
--- /dev/null
+++ b/apps/protein_folding/helixfold3/data/demo_disulf.json
@@ -0,0 +1,14 @@
+{
+    "entities": [
+        {
+            "type": "protein",
+            "sequence": "MASVKSSSSSSSSSFISLLLLILLVIVLQSQVIECQPQQSCTASLTGLNVCAPFLVPGSPTASTECCNAVQSINHDCMCNTMRIAAQIPAQCNLPPLSCSAN",
+            "count": 1
+        },
+        {
+            "type": "bond",
+            "bond": "A,CYS,41,SG,A,CYS,77,SG,disulf,2.2;A,CYS,51,SG,A,CYS,66,SG,disulf,2.2;A,CYS,67,SG,A,CYS,92,SG,disulf,2.2;A,CYS,79,SG,A,CYS,99,SG,disulf,2.2",
+            "_case_from": "https://www.uniprot.org/uniprotkb/Q43495/entry#ptm_processing"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
index 9e5b7d2e..a7d2708e 100644
--- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
+++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
@@ -115,7 +115,7 @@ def parse_covalent_bond_input(input_string: str) -> List[CovalentBond]:
 
         # Append the CovalentBond instance to the list
         covalent_bonds.append(covalent_bond)
-        logging.info(f"Added {len(covalent_bonds)} bonds: {covalent_bonds}")
+    logging.info(f"Added {len(covalent_bonds)} bonds: {covalent_bonds}")
 
     return covalent_bonds
 

From 22bbe7ade88f6547571a4f8e09df849e727ce6a4 Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sat, 24 Aug 2024 13:35:30 +0800
Subject: [PATCH 39/42] feat: disulfide bonding

---
 .../data/demo_disulf_homodimer.json           | 33 +++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 apps/protein_folding/helixfold3/data/demo_disulf_homodimer.json

diff --git a/apps/protein_folding/helixfold3/data/demo_disulf_homodimer.json b/apps/protein_folding/helixfold3/data/demo_disulf_homodimer.json
new file mode 100644
index 00000000..fd56fcd4
--- /dev/null
+++ b/apps/protein_folding/helixfold3/data/demo_disulf_homodimer.json
@@ -0,0 +1,33 @@
+{
+    "entities": [
+        {
+            "type": "protein",
+            "sequence": "NSVHPCCDPVICEPREGEHCISGPCCENCYFLNSGTICKRARGDGNQDYCTGITPDCPRNRYNV",
+            "count": 2
+        },
+        {
+            "type": "bond",
+            "bond": "A,CYS,6,SG,A,CYS,29,SG,disulf,2.3;A,CYS,20,SG,A,CYS,26,SG,disulf,2.3;A,CYS,25,SG,A,CYS,50,SG,disulf,2.3;A,CYS,38,SG,A,CYS,57,SG,disulf,2.3",
+            "_case_from": "https://www.uniprot.org/uniprotkb/P83658/entry#ptm_processing",
+            "_note": "Intrachain, A"
+        },
+        {
+            "type": "bond",
+            "bond": "B,CYS,6,SG,B,CYS,29,SG,disulf,2.3;B,CYS,20,SG,B,CYS,26,SG,disulf,2.3;B,CYS,25,SG,B,CYS,50,SG,disulf,2.3;B,CYS,38,SG,B,CYS,57,SG,disulf,2.3",
+            "_case_from": "https://www.uniprot.org/uniprotkb/P83658/entry#ptm_processing",
+            "_note": "Intrachain, B"
+        },
+        {
+            "type": "bond",
+            "bond": "A,CYS,7,SG,B,CYS,12,SG,disulf,2.3",
+            "_case_from": "https://www.uniprot.org/uniprotkb/P83658/entry#ptm_processing",
+            "_note": "Interchain, AB"
+        },
+        {
+            "type": "bond",
+            "bond": "B,CYS,7,SG,A,CYS,12,SG,disulf,2.3",
+            "_case_from": "https://www.uniprot.org/uniprotkb/P83658/entry#ptm_processing",
+            "_note": "Interchain, BA"
+        }
+    ]
+}
\ No newline at end of file

From 0026e5d634cc63260a268ef15bf3d13ac47c8056 Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sat, 24 Aug 2024 21:12:45 +0800
Subject: [PATCH 40/42] case: metalc

---
 .../helixfold3/data/demo_4Fe-4S.json          | 45 +++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 apps/protein_folding/helixfold3/data/demo_4Fe-4S.json

diff --git a/apps/protein_folding/helixfold3/data/demo_4Fe-4S.json b/apps/protein_folding/helixfold3/data/demo_4Fe-4S.json
new file mode 100644
index 00000000..4599c4af
--- /dev/null
+++ b/apps/protein_folding/helixfold3/data/demo_4Fe-4S.json
@@ -0,0 +1,45 @@
+{
+    "entities": [
+        {
+            "type": "protein",
+            "sequence": "MKLNVDGLLVYFPYDYIYPEQFSYMRELKRTLDAKGHGVLEMPSGTGKTVSLLALIMAYQRAYPLEVTKLIYCSRTVPEIEKVIEELRKLLNFYEKQEGEKLPFLGLALSSRKNLCIHPEVTPLRFGKDVDGKCHSLTASYVRAQYQHDTSLPHCRFYEEFDAHGREVPLPAGIYNLDDLKALGRRQGWCPYFLARYSILHANVVVYSYHYLLDPKIADLVSKELARKAVVVFDEAHNIDNVCIDSMSVNLTRRTLDRCQGNLETLQKTVLRIKETDEQRLRDEYRRLVEGLREASAARETDAHLANPVLPDEVLQEAVPGSIRTAEHFLGFLRRLLEYVKWRLRVQHVVQESPPAFLSGLAQRVCIQRKPLRFCAERLRSLLHTLEITDLADFSPLTLLANFATLVSTYAKGFTIIIEPFDDRTPTIANPILHFSCMDASLAIKPVFERFQSVIITSGTLSPLDIYPKILDFHPVTMATFTMTLARVCLCPMIIGRGNDQVAISSKFETREDIAVIRNYGNLLLEMSAVVPDGIVAFFTSYQYMESTVASWYEQGILENIQRNKLLFIETQDGAETSVALEKYQEACENGRGAILLSVARGKVSEGIDFVHHYGRAVIMFGVPYVYTQSRILKARLEYLRDQFQIRENDFLTFDAMRHAAQCVGRAIRGKTDYGLMVFADKRFARGDKRGKLPRWIQEHLTDANLNLTVDEGVQVAKYFLRQMAQPFHREDQLGLSLLSLEQLESEETLKRIEQIAQQL",
+            "count": 1
+        },
+        {
+            "type": "ligand",
+            "ccd": "SF4",
+            "count": 1,
+            "_note": "5T5I"
+        },
+        {
+            "type": "bond",
+            "bond": "A,CYS,116,SG,B,SF4,1,FE1,metalc,2.2;A,CYS,134,SG,B,SF4,1,FE2,metalc,2.2;A,CYS,155,SG,B,SF4,1,FE3,metalc,2.2;A,CYS,190,SG,B,SF4,1,FE4,metalc,2.2",
+            "_case_from": "https://www.uniprot.org/uniprotkb/P18074/entry",
+            "_note":"ALL_CYS-ALL_FE"
+        },
+        {
+            "type": "bond",
+            "bond": "B,SF4,1,FE1,B,SF4,1,S2,metalc,2.2;B,SF4,1,FE1,B,SF4,1,S3,metalc,2.2;B,SF4,1,FE1,B,SF4,1,S4,metalc,2.2",
+            "_case_from": "https://www.uniprot.org/uniprotkb/P18074/entry",
+            "_note":"FE1-S234"
+        },
+        {
+            "type": "bond",
+            "bond": "B,SF4,1,FE2,B,SF4,1,S1,metalc,2.2;B,SF4,1,FE2,B,SF4,1,S3,metalc,2.2;B,SF4,1,FE2,B,SF4,1,S4,metalc,2.2",
+            "_case_from": "https://www.uniprot.org/uniprotkb/P18074/entry",
+            "_note":"FE2-S134"
+        },
+        {
+            "type": "bond",
+            "bond": "B,SF4,1,FE3,B,SF4,1,S1,metalc,2.2;B,SF4,1,FE3,B,SF4,1,S2,metalc,2.2;B,SF4,1,FE3,B,SF4,1,S4,metalc,2.2",
+            "_case_from": "https://www.uniprot.org/uniprotkb/P18074/entry",
+            "_note":"FE3-S124"
+        },
+        {
+            "type": "bond",
+            "bond": "B,SF4,1,FE4,B,SF4,1,S1,metalc,2.2;B,SF4,1,FE14,B,SF4,1,S2,metalc,2.2;B,SF4,1,FE4,B,SF4,1,S3,metalc,2.2",
+            "_case_from": "https://www.uniprot.org/uniprotkb/P18074/entry",
+            "_note":"FE4-S123"
+        }
+    ]
+}
\ No newline at end of file

From 658cf28bc238cd3808ac8de3748b063cb8bd143f Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sat, 24 Aug 2024 12:26:00 +0800
Subject: [PATCH 41/42] Update pipeline_conf_bonds.py

---
 .../helixfold3/helixfold/data/pipeline_conf_bonds.py       | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
index a7d2708e..6b46bb18 100644
--- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
+++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
@@ -27,7 +27,8 @@
 }
 
 # Define the possible bond types as a Literal
-BondType = Literal["covale", "metal", "hydrogen", "ionic", "disulfide", "aromatic"]
+# https://mmcif.wwpdb.org/dictionaries/mmcif_std.dic/Items/_struct_conn_type.id.html
+BondType = Literal["covale", "covale_base", "covale_phosphate", "covale_sugar", "disulf", "hydrog",'metalc','mismat','modres','saltbr']
 
 
 @dataclass
@@ -281,8 +282,8 @@ def make_bond_features(covalent_bond: List[CovalentBond], all_chain_info, ccd_pr
     right_bond_idx = 1 if right_bond_idx == '.' else right_bond_idx
     
     if _bond.bond_type != "covale":
-      logging.warning(f'Ignore non-covale bond type: {_bond.bond_type}')
-      continue
+      logging.warning(f'Detected non-covale bond type: {_bond.bond_type}')
+      # continue
     
     if _bond.pdbx_dist_value > 2.4:
       # the covalent_bond is cut off by distance=2.4

From ab7a4242d90527bc8864f742597a82845c15b9df Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Mon, 26 Aug 2024 14:27:45 +0800
Subject: [PATCH 42/42] chore: cpu only for msa only

---
 .../helixfold3/helixfold/inference.py                | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py
index 8a862279..57fd692c 100644
--- a/apps/protein_folding/helixfold3/helixfold/inference.py
+++ b/apps/protein_folding/helixfold3/helixfold/inference.py
@@ -453,6 +453,12 @@ def split_prediction(pred, rank):
 def main(cfg: DictConfig):
     logging.set_verbosity(cfg.logging_level)
 
+    if cfg.msa_only == True:
+        logging.warning(f'Model inference will be skipped because MSA-only mode is required.')
+        logging.warning(f'Use CPU only')
+        paddle.device.set_device("cpu")
+        
+
     """main function"""
     new_einsum = os.getenv("FLAGS_new_einsum", True)
     print(f'>>> PaddlePaddle commit: {paddle.version.commit}')
@@ -505,6 +511,8 @@ def main(cfg: DictConfig):
             model.helixfold.set_state_dict(pd_params['model'])
         else:
             model.helixfold.set_state_dict(pd_params)
+
+    
     
     if cfg.precision == "bf16" and cfg.amp_level == "O2":
         raise NotImplementedError("bf16 O2 is not supported yet.")
@@ -531,6 +539,10 @@ def main(cfg: DictConfig):
         with open(features_pkl, 'wb') as f:
             pickle.dump(feature_dict, f, protocol=4)
 
+    if cfg.msa_only == True:
+        logging.warning(f'Model inference is skipped because MSA-only mode is required.')
+        exit()
+
     feature_dict['feat'] = batch_convert(feature_dict['feat'], add_batch=True)
     feature_dict['label'] = batch_convert(feature_dict['label'], add_batch=True)