From 6346752a654ece5b53cff8683c28b8c33b484a40 Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Fri, 16 Aug 2024 15:34:27 +0800
Subject: [PATCH 1/6] chore: hydra and pip

---
 .../helixfold/common/all_atom_pdb_save.py     |   7 +-
 .../helixfold/config/helixfold.yaml           |  59 ++++
 .../infer_scripts/feature_processing_aa.py    |   0
 .../infer_scripts/preprocess.py               |  13 +-
 .../infer_scripts/tools/mmcif_writer.py       |   0
 .../helixfold3/{ => helixfold}/inference.py   | 289 +++++++-----------
 .../{ => helixfold}/utils/__init__.py         |   0
 .../helixfold3/{ => helixfold}/utils/misc.py  |   0
 .../helixfold3/{ => helixfold}/utils/model.py |   0
 .../helixfold3/{ => helixfold}/utils/utils.py |   0
 .../protein_folding/helixfold3/pyproject.toml |  48 +++
 apps/protein_folding/helixfold3/setup_env.sh  |  18 ++
 12 files changed, 248 insertions(+), 186 deletions(-)
 create mode 100644 apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
 rename apps/protein_folding/helixfold3/{ => helixfold}/infer_scripts/feature_processing_aa.py (100%)
 rename apps/protein_folding/helixfold3/{ => helixfold}/infer_scripts/preprocess.py (97%)
 rename apps/protein_folding/helixfold3/{ => helixfold}/infer_scripts/tools/mmcif_writer.py (100%)
 rename apps/protein_folding/helixfold3/{ => helixfold}/inference.py (64%)
 rename apps/protein_folding/helixfold3/{ => helixfold}/utils/__init__.py (100%)
 rename apps/protein_folding/helixfold3/{ => helixfold}/utils/misc.py (100%)
 rename apps/protein_folding/helixfold3/{ => helixfold}/utils/model.py (100%)
 rename apps/protein_folding/helixfold3/{ => helixfold}/utils/utils.py (100%)
 create mode 100644 apps/protein_folding/helixfold3/pyproject.toml
 create mode 100644 apps/protein_folding/helixfold3/setup_env.sh

diff --git a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
index deb8e087..9c9f288e 100644
--- a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
+++ b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
@@ -164,10 +164,13 @@ def prediction_to_mmcif(pred_atom_pos: Union[np.ndarray, paddle.Tensor],
     - maxit_binary: path to maxit_binary, use to convert pdb to cif
     - mmcif_path: path to save *.cif
   """
-  assert maxit_binary is not None and os.path.exists(maxit_binary), (
+  if not (maxit_binary and os.path.isfile(maxit_binary)):
+    raise FileNotFoundError(
       f'maxit_binary: {maxit_binary} not exists. '
       f'link: https://sw-tools.rcsb.org/apps/MAXIT/source.html')
-  assert mmcif_path.endswith('.cif'), f'mmcif_path should endswith .cif; got {mmcif_path}'
+  # The .pdb path is derived from mmcif_path by swapping the suffix, so require .cif.
+  if not mmcif_path.endswith('.cif'):
+    raise ValueError(f'mmcif_path should endswith .cif; got {mmcif_path}')
 
   pdb_path = mmcif_path.replace('.cif', '.pdb')
   pdb_path = prediction_to_pdb(pred_atom_pos, FeatsDict, pdb_path)
diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
new file mode 100644
index 00000000..fd70ada0
--- /dev/null
+++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
@@ -0,0 +1,59 @@
+defaults:
+  - _self_
+
+# General configuration
+
+bf16_infer: false  # Corresponds to --bf16_infer
+seed: null  # Corresponds to --seed
+logging_level: DEBUG  # Corresponds to --logging_level
+job_id: 'structure_prediction'  # Corresponds to --model_name
+weight_path: /mnt/db/weights/helixfold/HelixFold3-params-240814/HelixFold3-240814.pdparams  # Corresponds to --init_model
+precision: fp32  # Corresponds to --precision
+amp_level: O1  # Corresponds to --amp_level
+infer_times: 1  # Corresponds to --infer_times
+diff_batch_size: -1  # Corresponds to --diff_batch_size
+use_small_bfd: false # Corresponds to --use_small_bfd
+
+# File paths
+
+input: null  # Corresponds to --input_json, required field
+output: null  # Corresponds to --output_dir, required field
+
+
+# Binary tool paths, leave them as null to find proper ones under PATH or conda bin path
+bin:
+  jackhmmer: null    # Corresponds to --jackhmmer_binary_path
+  hhblits: null  # Corresponds to --hhblits_binary_path
+  hhsearch: null   # Corresponds to --hhsearch_binary_path
+  kalign: null  # Corresponds to --kalign_binary_path
+  hmmsearch: null  # Corresponds to --hmmsearch_binary_path
+  hmmbuild: null  # Corresponds to --hmmbuild_binary_path
+  nhmmer: null  # Corresponds to --nhmmer_binary_path
+  obabel: null
+
+# Database paths
+db:
+  uniprot: /mnt/db/uniprot/uniprot.fasta  # Corresponds to --uniprot_database_path, required field
+  pdb_seqres: /mnt/db/pdb_seqres/pdb_seqres.txt  # Corresponds to --pdb_seqres_database_path, required field
+  uniref90: /mnt/db/uniref90/uniref90.fasta  # Corresponds to --uniref90_database_path, required field
+  mgnify: /mnt/db/mgnify/mgy_clusters.fa  # Corresponds to --mgnify_database_path, required field
+  bfd: /mnt/db/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt  # Corresponds to --bfd_database_path
+  small_bfd: null  # Corresponds to --small_bfd_database_path
+  uniclust30: /mnt/db/uniref30_uc30/UniRef30_2022_02/UniRef30_2022_02  # Corresponds to --uniclust30_database_path
+  rfam: /mnt/db/helixfold/rna/Rfam-14.9_rep_seq.fasta  # Corresponds to --rfam_database_path, required field
+  ccd_preprocessed: /mnt/db/ccd/ccd_preprocessed_etkdg.pkl.gz  # Corresponds to --ccd_preprocessed_path, required field
+
+# Template and PDB information
+template:
+  mmcif_dir: /mnt/db/pdb_mmcif/mmcif_files  # Corresponds to --template_mmcif_dir, required field
+  max_date: '2023-03-15'  # Corresponds to --max_template_date, required field
+  obsolete_pdbs: /mnt/db/pdb_mmcif/obsolete.dat  # Corresponds to --obsolete_pdbs_path, required field
+
+# Preset configuration
+preset:
+  preset: full_dbs  # Corresponds to --preset, choices=['reduced_dbs', 'full_dbs']
+
+# Other configurations
+other:
+  maxit_binary: /mnt/data/yinying/software/maxit/maxit-v11.100-prod-src/bin/maxit  # Corresponds to --maxit_binary
+  no_msa_templ_feats: false  # Corresponds to --no_msa_templ_feats
diff --git a/apps/protein_folding/helixfold3/infer_scripts/feature_processing_aa.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py
similarity index 100%
rename from apps/protein_folding/helixfold3/infer_scripts/feature_processing_aa.py
rename to apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py
diff --git a/apps/protein_folding/helixfold3/infer_scripts/preprocess.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py
similarity index 97%
rename from apps/protein_folding/helixfold3/infer_scripts/preprocess.py
rename to apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py
index 41cd44ac..eb8eb14f 100644
--- a/apps/protein_folding/helixfold3/infer_scripts/preprocess.py
+++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py
@@ -5,17 +5,17 @@
         'seqs': ccd_seqs,
         'msa_seqs': msa_seqs,
         'count': count,
-        'extra_mol_infos': {}， for which seqs has the modify residue type or smiles.
+        'extra_mol_infos': {}, for which seqs has the modify residue type or smiles.
 """
 import collections
 import copy
+import gzip
 import os
 import json
 import sys
 import subprocess
 import tempfile
 import itertools
-sys.path.append('../')
 import rdkit
 from rdkit import Chem
 from rdkit.Chem import AllChem
@@ -52,9 +52,7 @@
     3: 'Unknown error.'
 }
 
-OBABEL_BIN = os.getenv('OBABEL_BIN')
-if not os.path.exists(OBABEL_BIN):
-    raise FileNotFoundError(f'Cannot find obabel binary at {OBABEL_BIN}.')
+
 
 
 def read_json(path):
@@ -144,6 +142,11 @@ def smiles_toMol_obabel(smiles):
     """
         generate mol from smiles using obabel;
     """    
+    
+    OBABEL_BIN = os.getenv('OBABEL_BIN')
+    if not (OBABEL_BIN and os.path.isfile(OBABEL_BIN)):
+        raise FileNotFoundError(f'Cannot find obabel binary at {OBABEL_BIN}.')
+    
     with tempfile.NamedTemporaryFile(suffix=".mol2") as temp_file:
         print(f"[OBABEL] Temporary file created: {temp_file.name}")
         obabel_cmd = f"{OBABEL_BIN} -:'{smiles}' -omol2 -O{temp_file.name} --gen3d"
diff --git a/apps/protein_folding/helixfold3/infer_scripts/tools/mmcif_writer.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/tools/mmcif_writer.py
similarity index 100%
rename from apps/protein_folding/helixfold3/infer_scripts/tools/mmcif_writer.py
rename to apps/protein_folding/helixfold3/helixfold/infer_scripts/tools/mmcif_writer.py
diff --git a/apps/protein_folding/helixfold3/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py
similarity index 64%
rename from apps/protein_folding/helixfold3/inference.py
rename to apps/protein_folding/helixfold3/helixfold/inference.py
index 51cf6ec6..b3fbf745 100644
--- a/apps/protein_folding/helixfold3/inference.py
+++ b/apps/protein_folding/helixfold3/helixfold/inference.py
@@ -16,7 +16,6 @@
 import re
 import os
 import copy
-import argparse
 import random
 import paddle
 import json
@@ -25,6 +24,11 @@
 import shutil
 import logging
 import numpy as np
+import shutil
+
+from omegaconf import DictConfig
+import hydra
+
 from helixfold.common import all_atom_pdb_save
 from helixfold.model import config, utils
 from helixfold.data import pipeline_parallel as pipeline
@@ -34,12 +38,14 @@
 from helixfold.data.utils import atom_level_keys, map_to_continuous_indices
 from helixfold.data.tools import hmmsearch
 from helixfold.data import templates
-from utils.utils import get_custom_amp_list
-from utils.model import RunModel
-from utils.misc import set_logging_level
+from helixfold.utils.utils import get_custom_amp_list
+from helixfold.utils.model import RunModel
+from helixfold.utils.misc import set_logging_level
 from typing import Dict
-from infer_scripts import feature_processing_aa, preprocess
-from infer_scripts.tools import mmcif_writer
+from helixfold.infer_scripts import feature_processing_aa, preprocess
+from helixfold.infer_scripts.tools import mmcif_writer
+
+script_path=os.path.dirname(__file__)
 
 ALLOWED_LIGAND_BONDS_TYPE_MAP = preprocess.ALLOWED_LIGAND_BONDS_TYPE_MAP
 INVERSE_ALLOWED_LIGAND_BONDS_TYPE_MAP = {
@@ -105,45 +111,57 @@ def convert_to_json_compatible(obj):
         return [convert_to_json_compatible(i) for i in obj]
     else:
         return obj
-    
-def get_msa_templates_pipeline(args) -> Dict:
-    use_precomputed_msas = True # FLAGS.use_precomputed_msas
+
+def resolve_bin_path(cfg_path: str, default_binary_name: str)-> str:
+    """Return cfg_path if it is an existing file, else the shutil.which() hit for default_binary_name; raise FileNotFoundError if neither exists."""
+    if cfg_path and os.path.isfile(cfg_path):
+        return cfg_path
+
+    if cfg_val:=shutil.which(default_binary_name):
+        logging.warning(f'Using resolved {default_binary_name}: {cfg_val}')
+        return cfg_val
+
+    raise FileNotFoundError(f"Could not find a proper binary path for {default_binary_name}: {cfg_path}.")
+
+def get_msa_templates_pipeline(cfg: DictConfig) -> Dict:
+    use_precomputed_msas = True  # Assuming this is a constant or should be set globally
+
     template_searcher = hmmsearch.Hmmsearch(
-        binary_path=args.hmmsearch_binary_path,
-        hmmbuild_binary_path=args.hmmbuild_binary_path,
-        database_path=args.pdb_seqres_database_path)
+        binary_path=resolve_bin_path(cfg.bin.hmmsearch, 'hmmsearch'),
+        hmmbuild_binary_path=resolve_bin_path(cfg.bin.hmmbuild, 'hmmbuild'),
+        database_path=cfg.db.pdb_seqres)
 
     template_featurizer = templates.HmmsearchHitFeaturizer(
-        mmcif_dir=args.template_mmcif_dir,
-        max_template_date=args.max_template_date,
+        mmcif_dir=cfg.template.mmcif_dir,
+        max_template_date=cfg.template.max_date,
         max_hits=MAX_TEMPLATE_HITS,
-        kalign_binary_path=args.kalign_binary_path,
+        kalign_binary_path=resolve_bin_path(cfg.bin.kalign, 'kalign'),
         release_dates_path=None,
-        obsolete_pdbs_path=args.obsolete_pdbs_path)
+        obsolete_pdbs_path=cfg.template.obsolete_pdbs)
 
     monomer_data_pipeline = pipeline.DataPipeline(
-        jackhmmer_binary_path=args.jackhmmer_binary_path,
-        hhblits_binary_path=args.hhblits_binary_path,
-        hhsearch_binary_path=args.hhsearch_binary_path,
-        uniref90_database_path=args.uniref90_database_path,
-        mgnify_database_path=args.mgnify_database_path,
-        bfd_database_path=args.bfd_database_path,
-        uniclust30_database_path=args.uniclust30_database_path,
-        small_bfd_database_path=args.small_bfd_database_path ,
+        jackhmmer_binary_path=resolve_bin_path(cfg.bin.jackhmmer, 'jackhmmer'),
+        hhblits_binary_path=resolve_bin_path(cfg.bin.hhblits, 'hhblits'),
+        hhsearch_binary_path=resolve_bin_path(cfg.bin.hhsearch, 'hhsearch'),
+        uniref90_database_path=cfg.db.uniref90,
+        mgnify_database_path=cfg.db.mgnify,
+        bfd_database_path=cfg.db.bfd,
+        uniclust30_database_path=cfg.db.uniclust30,
+        small_bfd_database_path=cfg.db.small_bfd,
         template_searcher=template_searcher,
         template_featurizer=template_featurizer,
-        use_small_bfd=args.use_small_bfd,
+        use_small_bfd=cfg.use_small_bfd,
         use_precomputed_msas=use_precomputed_msas)
 
     prot_data_pipeline = pipeline_multimer.DataPipeline(
         monomer_data_pipeline=monomer_data_pipeline,
-        jackhmmer_binary_path=args.jackhmmer_binary_path,
-        uniprot_database_path=args.uniprot_database_path,
+        jackhmmer_binary_path=resolve_bin_path(cfg.bin.jackhmmer, 'jackhmmer'),
+        uniprot_database_path=cfg.db.uniprot,
         use_precomputed_msas=use_precomputed_msas)
 
     rna_monomer_data_pipeline = pipeline_rna.RNADataPipeline(
-      hmmer_binary_path=args.nhmmer_binary_path,
-      rfam_database_path=args.rfam_database_path,
+      hmmer_binary_path=resolve_bin_path(cfg.bin.nhmmer, 'nhmmer'),
+      rfam_database_path=cfg.db.rfam,
       rnacentral_database_path=None,
       nt_database_path=None,     
       species_identifer_map_path=None,
@@ -156,7 +174,6 @@ def get_msa_templates_pipeline(args) -> Dict:
         'protein': prot_data_pipeline,
         'rna': rna_data_pipeline
     }
-
 def ranking_all_predictions(output_dirs):
     ranking_score_path_map = {}
     for outpath in output_dirs:
@@ -176,27 +193,29 @@ def ranking_all_predictions(output_dirs):
         rank_id += 1
 
 @paddle.no_grad()
-def eval(args, model, batch):
-    """evaluate a given dataset"""
+def eval(cfg: DictConfig, model:RunModel, batch):
+    """Evaluate a given dataset"""
     model.eval()       
         
-    # inference
+    # Inference
     def _forward_with_precision(batch):
-        if args.precision == "bf16" or args.bf16_infer:
+        precision=cfg.precision
+        if precision not in ('bf16','fp32',):
+            raise ValueError("Please choose precision from bf16 and fp32!")
+
+        if cfg.precision == "bf16" or cfg.bf16_infer:
             black_list, white_list = get_custom_amp_list()
             with paddle.amp.auto_cast(enable=True,
-                                        custom_white_list=white_list, 
-                                        custom_black_list=black_list, 
-                                        level=args.amp_level, 
-                                        dtype='bfloat16'):
+                                      custom_white_list=white_list, 
+                                      custom_black_list=black_list, 
+                                      level=cfg.amp_level, 
+                                      dtype='bfloat16'):
                 return model(batch, compute_loss=False)
-        elif args.precision == "fp32":
-            return model(batch, compute_loss=False)
-        else:
-            raise ValueError("Please choose precision from bf16 and fp32! ")
+
+        return model(batch, compute_loss=False)
         
     res = _forward_with_precision(batch)
-    logger.info(f"Inference Succeeds...\n")
+    logger.info("Inference Succeeds...\n")
     return res
 
 
@@ -430,52 +449,55 @@ def split_prediction(pred, rank):
     return prediction
 
 
-def main(args):
-    set_logging_level(args.logging_level)
+@hydra.main(version_base=None, config_path=os.path.join(script_path,'config',),config_name='helixfold')
+def main(cfg: DictConfig):
+    set_logging_level(cfg.logging_level)
 
     """main function"""
     new_einsum = os.getenv("FLAGS_new_einsum", True)
     print(f'>>> PaddlePaddle commit: {paddle.version.commit}')
     print(f'>>> FLAGS_new_einsum: {new_einsum}')
-    print(f'>>> args:\n{args}')
+    print(f'>>> config:\n{cfg}')
 
-    all_entitys = preprocess_json_entity(args.input_json, args.output_dir)
+    all_entitys = preprocess_json_entity(cfg.input, cfg.output)
     ## check maxit binary path
-    if args.maxit_binary is not None:
-        assert os.path.exists(args.maxit_binary), \
-            f"The maxit binary path {args.maxit_binary} does not exists."
+    maxit_binary=resolve_bin_path(cfg.other.maxit_binary,'maxit')
+    # MAXIT expects RCSBROOT to be its install root (the directory containing bin/maxit), not bin/ itself.
+    RCSBROOT=os.path.dirname(os.path.dirname(os.path.realpath(maxit_binary)))
+    os.environ['RCSBROOT']=RCSBROOT
 
+    ## check obabel
+    obabel_bin=resolve_bin_path(cfg.bin.obabel,'obabel')
+    os.environ['OBABEL_BIN']=obabel_bin
 
-    ### set seed for reproduce experiment results
-    seed = args.seed
+    ### Set seed for reproducibility
+    seed = cfg.seed
     if seed is None:
         seed = np.random.randint(10000000)
     else:
-        logger.warning('seed is only used for reproduction')
+        logger.warning('Seed is only used for reproduction')
     init_seed(seed)
 
-
-    use_small_bfd = args.preset == 'reduced_dbs'
-    setattr(args, 'use_small_bfd', use_small_bfd)
+    use_small_bfd = cfg.preset.preset == 'reduced_dbs'
+    setattr(cfg, 'use_small_bfd', use_small_bfd)
     if use_small_bfd:
-        assert args.small_bfd_database_path is not None
+        assert cfg.db.small_bfd is not None
     else:
-        assert args.bfd_database_path is not None
-        assert args.uniclust30_database_path is not None
+        assert cfg.db.bfd is not None
+        assert cfg.db.uniclust30 is not None
 
     logger.info('Getting MSA/Template Pipelines...')
     msa_templ_data_pipeline_dict = get_msa_templates_pipeline(args)
         
-
-    ### create model
-    model_config = config.model_config(args.model_name)
-    print(f'>>> model_config:\n{model_config}')
+    ### Create model
+    model_config = config.model_config(cfg.job_id)
+    #print(f'>>> model_config:\n{model_config}')
 
     model = RunModel(model_config)
 
-    if (not args.init_model is None) and (not args.init_model == ""):
-        print(f"Load pretrain model from {args.init_model}")
-        pd_params = paddle.load(args.init_model)
+    if (not cfg.weight_path is None) and (cfg.weight_path != ""):
+        print(f"Load pretrain model from {cfg.weight_path}")
+        pd_params = paddle.load(cfg.weight_path)
         
         has_opt = 'optimizer' in pd_params
         if has_opt:
@@ -483,42 +505,46 @@ def main(args):
         else:
             model.helixfold.set_state_dict(pd_params)
     
-    if args.precision == "bf16" and args.amp_level == "O2":
+    if cfg.precision == "bf16" and cfg.amp_level == "O2":
         raise NotImplementedError("bf16 O2 is not supported yet.")
 
     print(f"============ Data Loading ============")
-    job_base = pathlib.Path(args.input_json).stem
-    output_dir_base = pathlib.Path(args.output_dir).joinpath(job_base)
+    job_base = pathlib.Path(cfg.input).stem
+    output_dir_base = pathlib.Path(cfg.output).joinpath(job_base)
     msa_output_dir = output_dir_base.joinpath('msas')
     msa_output_dir.mkdir(parents=True, exist_ok=True)
 
     features_pkl = output_dir_base.joinpath('final_features.pkl')
-    feature_dict = feature_processing_aa.process_input_json(
-                    all_entitys, 
-                    ccd_preprocessed_path=args.ccd_preprocessed_path,
-                    msa_templ_data_pipeline_dict=msa_templ_data_pipeline_dict,
-                    msa_output_dir=msa_output_dir)
+    if features_pkl.exists():
+        with open(features_pkl, 'rb') as f:
+            feature_dict = pickle.load(f)
+    else:
+        feature_dict = feature_processing_aa.process_input_json(
+                        all_entitys, 
+                        ccd_preprocessed_path=cfg.db.ccd_preprocessed,
+                        msa_templ_data_pipeline_dict=msa_templ_data_pipeline_dict,
+                        msa_output_dir=msa_output_dir)
 
-    # save features
-    with open(features_pkl, 'wb') as f:
-        pickle.dump(feature_dict, f, protocol=4)
+        # save features
+        with open(features_pkl, 'wb') as f:
+            pickle.dump(feature_dict, f, protocol=4)
 
     feature_dict['feat'] = batch_convert(feature_dict['feat'], add_batch=True)
     feature_dict['label'] = batch_convert(feature_dict['label'], add_batch=True)
     
     print(f"============ Start Inference ============")
     
-    infer_times = args.infer_times
-    if args.diff_batch_size > 0:
-        model_config.model.heads.diffusion_module.test_diff_batch_size = args.diff_batch_size
+    infer_times = cfg.infer_times
+    if cfg.diff_batch_size > 0:
+        model_config.model.heads.diffusion_module.test_diff_batch_size = cfg.diff_batch_size
     diff_batch_size = model_config.model.heads.diffusion_module.test_diff_batch_size 
     logger.info(f'Inference {infer_times} Times...')
-    logger.info(f" diffusion batch size {diff_batch_size}...\n")
+    logger.info(f"Diffusion batch size {diff_batch_size}...\n")
     all_pred_path = []
     for infer_id in range(infer_times):
         
         logger.info(f'Start {infer_id}-th inference...\n')
-        prediction = eval(args, model, feature_dict)
+        prediction = eval(cfg, model, feature_dict)
         
         # save result
         prediction = split_prediction(prediction, diff_batch_size)
@@ -530,7 +556,7 @@ def main(args):
                         feature_dict=feature_dict,
                         prediction=prediction[rank_id],
                         output_dir=output_dir, 
-                        maxit_bin=args.maxit_binary)
+                        maxit_bin=maxit_binary)
             all_pred_path.append(output_dir)
     
     # final ranking
@@ -538,100 +564,5 @@ def main(args):
     ranking_all_predictions(all_pred_path)
     print(f'============ Inference finished ! ============')
 
-
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--bf16_infer", action='store_true', default=False)
-    parser.add_argument("--seed", type=int, default=None, help="set seed for reproduce experiment results, None is do not set seed")
-    parser.add_argument("--logging_level", type=str, default="DEBUG", help="NOTSET, DEBUG, INFO, WARNING, ERROR, CRITICAL")
-    parser.add_argument("--model_name", type=str, help='used to choose model config')
-    parser.add_argument("--init_model", type=str, default='')
-    parser.add_argument("--precision", type=str, choices=['fp32', 'bf16'], default='fp32')
-    parser.add_argument("--amp_level", type=str, default='O1')
-    parser.add_argument("--infer_times", type=int, default=1)
-    parser.add_argument("--diff_batch_size", type=int, default=-1)
-    parser.add_argument('--input_json', type=str,
-                        default=None, required=True,
-                        help='Paths to json file, each containing '
-                        'entity information including sequence, smiles or CCD, copies etc.')
-    parser.add_argument('--output_dir', type=str,
-                        default=None, required=True,
-                        help='Path to a directory that will store results.')
-    parser.add_argument('--ccd_preprocessed_path', type=str,
-                        default=None, required=True,
-                        help='Path to CCD preprocessed files.')
-    parser.add_argument('--jackhmmer_binary_path', type=str,
-                        default='/usr/bin/jackhmmer',
-                        help='Path to the JackHMMER executable.')
-    parser.add_argument('--hhblits_binary_path', type=str,
-                        default='/usr/bin/hhblits',
-                        help='Path to the HHblits executable.')
-    parser.add_argument('--hhsearch_binary_path', type=str,
-                        default='/usr/bin/hhsearch',
-                        help='Path to the HHsearch executable.')
-    parser.add_argument('--kalign_binary_path', type=str,
-                        default='/usr/bin/kalign',
-                        help='Path to the Kalign executable.')
-    parser.add_argument('--hmmsearch_binary_path', type=str,
-                        default='/usr/bin/hmmsearch',
-                        help='Path to the hmmsearch executable.')
-    parser.add_argument('--hmmbuild_binary_path', type=str,
-                        default='/usr/bin/hmmbuild',
-                        help='Path to the hmmbuild executable.')
-
-    # binary path of the tool for RNA MSA searching
-    parser.add_argument('--nhmmer_binary_path', type=str,
-                        default='/usr/bin/nhmmer',
-                        help='Path to the nhmmer executable.')
-    
-    parser.add_argument('--uniprot_database_path', type=str,
-                        default=None, required=True,
-                        help='Path to the Uniprot database for use '
-                        'by JackHMMER.')
-    parser.add_argument('--pdb_seqres_database_path', type=str,
-                        default=None, required=True,
-                        help='Path to the PDB '
-                        'seqres database for use by hmmsearch.')
-    parser.add_argument('--uniref90_database_path', type=str,
-                        default=None, required=True,
-                        help='Path to the Uniref90 database for use '
-                        'by JackHMMER.')
-    parser.add_argument('--mgnify_database_path', type=str,
-                        default=None, required=True,
-                        help='Path to the MGnify database for use by '
-                        'JackHMMER.')
-    parser.add_argument('--bfd_database_path', type=str, default=None,
-                        help='Path to the BFD database for use by HHblits.')
-    parser.add_argument('--small_bfd_database_path', type=str, default=None,
-                        help='Path to the small version of BFD used '
-                        'with the "reduced_dbs" preset.')
-    parser.add_argument('--uniclust30_database_path', type=str, default=None,
-                        help='Path to the Uniclust30 database for use '
-                        'by HHblits.')
-    # RNA MSA searching databases
-    parser.add_argument('--rfam_database_path', type=str,
-                        default=None, required=True,
-                        help='Path to the Rfam database for RNA MSA searching.')
-    parser.add_argument('--template_mmcif_dir', type=str,
-                        default=None, required=True,
-                        help='Path to a directory with template mmCIF '
-                        'structures, each named <pdb_id>.cif')
-    parser.add_argument('--max_template_date', type=str,
-                        default=None, required=True,
-                        help='Maximum template release date to consider. '
-                        'Important if folding historical test sets.')
-    parser.add_argument('--obsolete_pdbs_path', type=str,
-                        default=None, required=True,
-                        help='Path to file containing a mapping from '
-                        'obsolete PDB IDs to the PDB IDs of their '
-                        'replacements.')
-    parser.add_argument('--preset',
-                        default='full_dbs', required=False,
-                        choices=['reduced_dbs', 'full_dbs'],
-                        help='Choose preset model configuration - '
-                        'no ensembling and smaller genetic database '
-                        'config (reduced_dbs), no ensembling and full '
-                        'genetic database config  (full_dbs)')
-    parser.add_argument('--maxit_binary', type=str, default=None)
-    args = parser.parse_args()
-    main(args)
+    main()
diff --git a/apps/protein_folding/helixfold3/utils/__init__.py b/apps/protein_folding/helixfold3/helixfold/utils/__init__.py
similarity index 100%
rename from apps/protein_folding/helixfold3/utils/__init__.py
rename to apps/protein_folding/helixfold3/helixfold/utils/__init__.py
diff --git a/apps/protein_folding/helixfold3/utils/misc.py b/apps/protein_folding/helixfold3/helixfold/utils/misc.py
similarity index 100%
rename from apps/protein_folding/helixfold3/utils/misc.py
rename to apps/protein_folding/helixfold3/helixfold/utils/misc.py
diff --git a/apps/protein_folding/helixfold3/utils/model.py b/apps/protein_folding/helixfold3/helixfold/utils/model.py
similarity index 100%
rename from apps/protein_folding/helixfold3/utils/model.py
rename to apps/protein_folding/helixfold3/helixfold/utils/model.py
diff --git a/apps/protein_folding/helixfold3/utils/utils.py b/apps/protein_folding/helixfold3/helixfold/utils/utils.py
similarity index 100%
rename from apps/protein_folding/helixfold3/utils/utils.py
rename to apps/protein_folding/helixfold3/helixfold/utils/utils.py
diff --git a/apps/protein_folding/helixfold3/pyproject.toml b/apps/protein_folding/helixfold3/pyproject.toml
new file mode 100644
index 00000000..bc988ca9
--- /dev/null
+++ b/apps/protein_folding/helixfold3/pyproject.toml
@@ -0,0 +1,48 @@
+[build-system]
+requires = ["poetry-core>=1.0.0,<2.0.0"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.poetry]
+name = "helixfold"
+version = "3.0.0"
+description = "Code for helixfold v3"
+authors = ["Name <email@address>"]
+
+readme = "README.md"
+license = "MIT"
+repository = "https://github.com/PaddlePaddle/PaddleHelix/blob/dev/apps/protein_folding/helixfold3"
+classifiers = [
+    "Topic :: Scientific/Engineering :: Biochemistry",
+    "Topic :: Scientific/Engineering :: Protein Engineering"
+]
+
+
+packages = [
+    { include = "helixfold" },
+    { include = "helixfold/*.py" },
+]
+
+
+[tool.poetry.dependencies]
+python = "^3.8" 
+
+absl-py = "0.13.0"
+biopython = "1.79"
+chex = "0.0.7"
+dm-haiku = "0.0.4"
+dm-tree = "0.1.6"
+docker = "5.0.0"
+immutabledict = "2.0.0"
+jax = "0.2.14"
+ml-collections = "0.1.0"
+pandas = "1.3.4"
+scipy = "1.9.0"
+rdkit-pypi = "2022.9.5"
+posebusters = "*" 
+hydra-core= "^1.3.2"
+omegaconf = "^2.3.0"
+
+
+
+[tool.poetry.scripts]
+helixfold = 'helixfold.inference:main'
diff --git a/apps/protein_folding/helixfold3/setup_env.sh b/apps/protein_folding/helixfold3/setup_env.sh
new file mode 100644
index 00000000..30f008d6
--- /dev/null
+++ b/apps/protein_folding/helixfold3/setup_env.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+ENV_NAME='helixfold'
+CUDA=12.0
+
+# follow https://developer.nvidia.com/cuda-downloads to install cuda and cudatoolkit
+
+# Install py env
+conda create -n ${ENV_NAME} -y -c conda-forge  pip  python=3.9;
+source activate ${ENV_NAME}
+conda install -y cudnn=8.4.1 cudatoolkit=11.7 nccl=2.14.3 -c conda-forge -c nvidia
+
+conda install -y -c bioconda hmmer==3.3.2 kalign2==2.04 hhsuite==3.3.0 
+conda install -y -c conda-forge openbabel
+
+python -m pip install --upgrade 'pip<24';pip install .  --no-cache-dir
+
+pip install https://paddle-wheel.bj.bcebos.com/2.5.1/linux/linux-gpu-cuda11.7-cudnn8.4.1-mkl-gcc8.2-avx/paddlepaddle_gpu-2.5.1.post117-cp39-cp39-linux_x86_64.whl

From a5925e0234e88bb8b57042e83402dc6b59cee3b3 Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sat, 17 Aug 2024 10:52:13 +0800
Subject: [PATCH 2/6] use reduced bfd

---
 .../helixfold3/helixfold/config/helixfold.yaml              | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
index fd70ada0..60e8076d 100644
--- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
+++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
@@ -29,7 +29,7 @@ bin:
   hmmsearch: null  # Corresponds to --hmmsearch_binary_path
   hmmbuild: null  # Corresponds to --hmmbuild_binary_path
   nhmmer: null  # Corresponds to --nhmmer_binary_path
-  obabel: null
+  obabel: null  # Inject to env as OBABEL_BIN
 
 # Database paths
 db:
@@ -38,7 +38,7 @@ db:
   uniref90: /mnt/db/uniref90/uniref90.fasta  # Corresponds to --uniref90_database_path, required field
   mgnify: /mnt/db/mgnify/mgy_clusters.fa  # Corresponds to --mgnify_database_path, required field
   bfd: /mnt/db/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt  # Corresponds to --bfd_database_path
-  small_bfd: null  # Corresponds to --small_bfd_database_path
+  small_bfd: /mnt/db/reduced_bfd/bfd-first_non_consensus_sequences.fasta  # Corresponds to --small_bfd_database_path
   uniclust30: /mnt/db/uniref30_uc30/UniRef30_2022_02/UniRef30_2022_02  # Corresponds to --uniclust30_database_path
   rfam: /mnt/db/helixfold/rna/Rfam-14.9_rep_seq.fasta  # Corresponds to --rfam_database_path, required field
   ccd_preprocessed: /mnt/db/ccd/ccd_preprocessed_etkdg.pkl.gz  # Corresponds to --ccd_preprocessed_path, required field
@@ -51,7 +51,7 @@ template:
 
 # Preset configuration
 preset:
-  preset: full_dbs  # Corresponds to --preset, choices=['reduced_dbs', 'full_dbs']
+  preset: reduced_dbs  # Corresponds to --preset, choices=['reduced_dbs', 'full_dbs']
 
 # Other configurations
 other:

From ab1648aad453e64203c69d538536ddc6af8d7bee Mon Sep 17 00:00:00 2001
From: Ryan Garcia <47666442+RyanGarciaLI@users.noreply.github.com>
Date: Thu, 15 Aug 2024 21:20:39 +0800
Subject: [PATCH 3/6] disable no MSA mode (#312)


From eb15ef707d162b80959530daeaaf681934b95486 Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sat, 17 Aug 2024 11:37:57 +0800
Subject: [PATCH 4/6] feat: model config overridden

add override flag

fix: enable multiple config overrides

feat: model config overridden

fix: imports

doc: merge to readme

fix:config

add configs
---
 apps/protein_folding/helixfold3/README.md     | 99 +++++++++----------
 .../helixfold/config/helixfold.yaml           | 17 +++-
 .../helixfold3/helixfold/inference.py         | 12 ++-
 .../helixfold3/helixfold/model/config.py      | 49 ++++++---
 .../helixfold3/requirements.txt               | 13 ---
 apps/protein_folding/helixfold3/run_infer.sh  | 39 --------
 apps/protein_folding/helixfold3/setup_env.sh  | 18 ----
 7 files changed, 102 insertions(+), 145 deletions(-)
 delete mode 100644 apps/protein_folding/helixfold3/requirements.txt
 delete mode 100644 apps/protein_folding/helixfold3/run_infer.sh
 delete mode 100644 apps/protein_folding/helixfold3/setup_env.sh

diff --git a/apps/protein_folding/helixfold3/README.md b/apps/protein_folding/helixfold3/README.md
index 12ed7041..7c1ba474 100644
--- a/apps/protein_folding/helixfold3/README.md
+++ b/apps/protein_folding/helixfold3/README.md
@@ -44,17 +44,26 @@ Locate to the directory of `helixfold` then run:
 ```bash
 # Install py env
 conda create -n helixfold -c conda-forge python=3.9
-conda install -y -c bioconda aria2 hmmer==3.3.2 kalign2==2.04 hhsuite==3.3.0 -n helixfold
-conda install -y -c conda-forge openbabel -n helixfold
 
 # activate the conda environment
 conda activate helixfold
 
+# adjust these version numbers to match your environment
+conda install -y cudnn=8.4.1 cudatoolkit=11.7 nccl=2.14.3 -c conda-forge -c nvidia
+conda install -y -c bioconda hmmer==3.3.2 kalign2==2.04 hhsuite==3.3.0 
+conda install -y -c conda-forge openbabel
+
 # install paddlepaddle
-python3 -m pip install paddlepaddle-gpu==2.6.1.post120 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
+pip install paddlepaddle-gpu==2.6.1.post120 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
 # or lower version: https://paddle-wheel.bj.bcebos.com/2.5.1/linux/linux-gpu-cuda11.7-cudnn8.4.1-mkl-gcc8.2-avx/paddlepaddle_gpu-2.5.1.post117-cp39-cp39-linux_x86_64.whl
 
-python3 -m pip install -r requirements.txt
+# downgrade pip
+pip install --upgrade 'pip<24'
+
+# edit the configuration file at `./helixfold/config/helixfold.yaml` to set your database and binary paths correctly
+
+# install HF3 as a python library
+pip install .  --no-cache-dir
 ```
 
 Note: If you have a different version of python3 and cuda, please refer to [here](https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html) for the compatible PaddlePaddle `dev` package.
@@ -125,58 +134,40 @@ sh run_infer.sh
 ```
 
 The script is as follows,
-```bash
-#!/bin/bash
-
-PYTHON_BIN="PATH/TO/YOUR/PYTHON"
-ENV_BIN="PATH/TO/YOUR/ENV"
-MAXIT_SRC="PATH/TO/MAXIT/SRC"
-DATA_DIR="PATH/TO/DATA"
-export OBABEL_BIN="PATH/TO/OBABEL/BIN"
-export PATH="$MAXIT_BIN/bin:$PATH"
-
-CUDA_VISIBLE_DEVICES=0 "$PYTHON_BIN" inference.py \
-    --maxit_binary "$MAXIT_SRC/bin/maxit" \
-    --jackhmmer_binary_path "$ENV_BIN/jackhmmer" \
-	--hhblits_binary_path "$ENV_BIN/hhblits" \
-	--hhsearch_binary_path "$ENV_BIN/hhsearch" \
-	--kalign_binary_path "$ENV_BIN/kalign" \
-	--hmmsearch_binary_path "$ENV_BIN/hmmsearch" \
-	--hmmbuild_binary_path "$ENV_BIN/hmmbuild" \
-    --nhmmer_binary_path "$ENV_BIN/nhmmer" \
-    --preset='reduced_dbs' \
-    --bfd_database_path "$DATA_DIR/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt" \
-    --small_bfd_database_path "$DATA_DIR/small_bfd/bfd-first_non_consensus_sequences.fasta" \
-    --bfd_database_path "$DATA_DIR/small_bfd/bfd-first_non_consensus_sequences.fasta" \
-    --uniclust30_database_path "$DATA_DIR/uniclust30/uniclust30_2018_08/uniclust30_2018_08" \
-    --uniprot_database_path "$DATA_DIR/uniprot/uniprot.fasta" \
-    --pdb_seqres_database_path "$DATA_DIR/pdb_seqres/pdb_seqres.txt" \
-    --uniref90_database_path "$DATA_DIR/uniref90/uniref90.fasta" \
-    --mgnify_database_path "$DATA_DIR/mgnify/mgy_clusters_2018_12.fa" \
-    --template_mmcif_dir "$DATA_DIR/pdb_mmcif/mmcif_files" \
-    --obsolete_pdbs_path "$DATA_DIR/pdb_mmcif/obsolete.dat" \
-    --ccd_preprocessed_path "$DATA_DIR/ccd_preprocessed_etkdg.pkl.gz" \
-    --rfam_database_path "$DATA_DIR/Rfam-14.9_rep_seq.fasta" \
-    --max_template_date=2020-05-14 \
-    --input_json data/demo_protein_ligand.json \
-    --output_dir ./output \
-    --model_name allatom_demo \
-    --init_model ./init_models/checkpoints.pdparams \
-    --infer_times 3 \
-    --precision "fp32"
+##### Run from default config
+```shell
+LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH \
+helixfold \
+    input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_8ecx.json \
+    output=. CONFIG_DIFFS.preset=allatom_demo
 ```
+
+##### Run with customized configuration dir and file(`./myfold.yaml`, for example):
+```shell
+LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH \
+helixfold --config-dir=. --config-name=myfold \
+    input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_6zcy_smiles.json \
+    output=. CONFIG_DIFFS.preset=allatom_demo
+```
+
+##### Run with additional configuration term 
+```shell
+LD_LIBRARY_PATH=/mnt/data/envs/conda_env/envs/helixfold/lib/:$LD_LIBRARY_PATH \
+helixfold \
+    input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_6zcy.json \
+    output=. \
+    CONFIG_DIFFS.model.heads.confidence_head.weight=0.01 \
+    CONFIG_DIFFS.model.global_config.subbatch_size=192
+```
+
 The descriptions of the above script are as follows:
-* Replace `MAXIT_SRC` with your installed `maxit`'s root path.
-* Replace `DATA_DIR` with your downloaded data path.
-* Replace `OBABEL_BIN` with your installed `openbabel` path.
-* Replace `ENV_BIN` with your conda virtual environment or any environment where `hhblits`, `hmmsearch` and other dependencies have been installed.
-* `--preset` - Set `'reduced_dbs'` to use small bfd or `'full_dbs'` to use full bfd.
-* `--*_database_path` - Path to datasets you have downloaded.
-* `--input_json` - Input data in the form of JSON. Input pattern in `./data/demo_*.json` for your reference.
-* `--output_dir` - Model output path. The output will be in a folder named the same as your `--input_json` under this path.
-* `--model_name` - Model name in `./helixfold/model/config.py`. Different model names specify different configurations. Mirro modification to configuration can be specified in `CONFIG_DIFFS` in the `config.py` without change to the full configuration in `CONFIG_ALLATOM`.
-* `--infer_time` - The number of inferences executed by model for single input. In each inference, the model will infer `5` times (`diff_batch_size`) for the same input by default. This hyperparameter can be changed by `model.head.diffusion_module.test_diff_batch_size` within `./helixfold/model/config.py`
-* `--precision` - Either `bf16` or `fp32`. Please check if your machine can support `bf16` or not beforing changing it. For example, `bf16` is supported by A100 and H100 or higher version while V100 only supports `fp32`.
+* `LD_LIBRARY_PATH` - This is required to load the `libcudnn.so` library if you encounter issue like `RuntimeError: (PreconditionNotMet) Cannot load cudnn shared library. Cannot invoke method cudnnGetVersion.`
+* `config-dir` - The directory that contains the alternative configuration file you would like to use.
+* `config-name` - The name of the configuration file you would like to use.
+* `input` - Input data in the form of JSON. Input pattern in `./data/demo_*.json` for your reference.
+* `output` - Model output path. The output will be in a folder named after your `input` JSON file under this path.
+* `CONFIG_DIFFS.preset` - Model name in `./helixfold/model/config.py`. Different model names specify different configurations. Minor modifications to the configuration can be specified in `CONFIG_DIFFS` in `config.py` without changing the full configuration in `CONFIG_ALLATOM`.
+* `CONFIG_DIFFS.*` - Override any model configuration value in `CONFIG_ALLATOM`.
 
 ### Understanding Model Output
 
diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
index 60e8076d..4900fdca 100644
--- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
+++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
@@ -13,11 +13,13 @@ amp_level: O1  # Corresponds to --amp_level
 infer_times: 1  # Corresponds to --infer_times
 diff_batch_size: -1  # Corresponds to --diff_batch_size
 use_small_bfd: false # Corresponds to --use_small_bfd
+msa_only: false # Only process msa
 
 # File paths
 
 input: null  # Corresponds to --input_json, required field
 output: null  # Corresponds to --output_dir, required field
+override: false # Set true to override existing msa output directory
 
 
 # Binary tool paths, leave them as null to find proper ones under PATH or conda bin path
@@ -55,5 +57,16 @@ preset:
 
 # Other configurations
 other:
-  maxit_binary: /mnt/data/yinying/software/maxit/maxit-v11.100-prod-src/bin/maxit  # Corresponds to --maxit_binary
-  no_msa_templ_feats: false  # Corresponds to --no_msa_templ_feats
+  maxit_binary: /mnt/data/software/maxit/maxit-v11.100-prod-src/bin/maxit  # Corresponds to --maxit_binary
+
+
+# CONFIG_DIFFS for advanced configuration
+CONFIG_DIFFS:
+  preset: null #choices=['null','allatom_demo', 'allatom_subbatch_64_recycle_1']
+  # model:
+    # global_config:
+    #   subbatch_size: 96 # model.global_config.subbatch_size
+    # num_recycle: 3 # model.num_recycle
+    # heads:
+    #   confidence_head:
+    #     weight: 0.0 # model.heads.confidence_head.weight
diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py
index b3fbf745..f67e18a5 100644
--- a/apps/protein_folding/helixfold3/helixfold/inference.py
+++ b/apps/protein_folding/helixfold3/helixfold/inference.py
@@ -36,15 +36,16 @@
 from helixfold.data import pipeline_rna_parallel as pipeline_rna
 from helixfold.data import pipeline_rna_multimer
 from helixfold.data.utils import atom_level_keys, map_to_continuous_indices
+from helixfold.utils.model import RunModel
 from helixfold.data.tools import hmmsearch
 from helixfold.data import templates
 from helixfold.utils.utils import get_custom_amp_list
-from helixfold.utils.model import RunModel
 from helixfold.utils.misc import set_logging_level
 from typing import Dict
 from helixfold.infer_scripts import feature_processing_aa, preprocess
 from helixfold.infer_scripts.tools import mmcif_writer
 
+
 script_path=os.path.dirname(__file__)
 
 ALLOWED_LIGAND_BONDS_TYPE_MAP = preprocess.ALLOWED_LIGAND_BONDS_TYPE_MAP
@@ -487,11 +488,11 @@ def main(cfg: DictConfig):
         assert cfg.db.uniclust30 is not None
 
     logger.info('Getting MSA/Template Pipelines...')
-    msa_templ_data_pipeline_dict = get_msa_templates_pipeline(args)
+    msa_templ_data_pipeline_dict = get_msa_templates_pipeline(cfg=cfg)
         
     ### Create model
-    model_config = config.model_config(cfg.job_id)
-    #print(f'>>> model_config:\n{model_config}')
+    model_config = config.model_config(cfg.CONFIG_DIFFS)
+    logging.warning(f'>>> Model config: \n{model_config}\n\n')
 
     model = RunModel(model_config)
 
@@ -515,8 +516,9 @@ def main(cfg: DictConfig):
     msa_output_dir.mkdir(parents=True, exist_ok=True)
 
     features_pkl = output_dir_base.joinpath('final_features.pkl')
-    if features_pkl.exists():
+    if features_pkl.exists() and not cfg.override:
         with open(features_pkl, 'rb') as f:
+            logging.info(f'Load features from precomputed {features_pkl}')
             feature_dict = pickle.load(f)
     else:
         feature_dict = feature_processing_aa.process_input_json(
diff --git a/apps/protein_folding/helixfold3/helixfold/model/config.py b/apps/protein_folding/helixfold3/helixfold/model/config.py
index 6da8566a..f9dbbf1d 100644
--- a/apps/protein_folding/helixfold3/helixfold/model/config.py
+++ b/apps/protein_folding/helixfold3/helixfold/model/config.py
@@ -15,7 +15,8 @@
 """Model config."""
 
 import copy
-import ml_collections
+from typing import Any, Union
+from omegaconf import DictConfig
 
 
 NUM_RES = 'num residues placeholder'
@@ -24,27 +25,47 @@
 NUM_TEMPLATES = 'num templates placeholder'
 
 
-def model_config(name: str) -> ml_collections.ConfigDict:
+def model_config(config_diffs: Union[str, DictConfig, dict[str, dict[str, Any]]]) -> DictConfig:
   """Get the ConfigDict of a model."""
 
   cfg = copy.deepcopy(CONFIG_ALLATOM)
-  if name in CONFIG_DIFFS:
-    cfg.update_from_flattened_dict(CONFIG_DIFFS[name])
+  if config_diffs is None or config_diffs=='':
+    # early return if nothing is changed
+    return cfg
 
-  return cfg
+  if isinstance(config_diffs, DictConfig):
+    if 'preset' in config_diffs and (preset_name:=config_diffs['preset']) in CONFIG_DIFFS:
+      updated_config=CONFIG_DIFFS[preset_name]
+      cfg.merge_with_dotlist(updated_config)
+      print(f'Updated config from `CONFIG_DIFFS.{preset_name}`: {updated_config}')
 
+    
+    # update from detailed configuration
+    if any(root_kw in config_diffs for root_kw in CONFIG_ALLATOM):
 
-CONFIG_DIFFS = {
-    'allatom_demo': {
-        'model.heads.confidence_head.weight': 0.01
-    },
-    'allatom_subbatch_64_recycle_1': {
-        'model.global_config.subbatch_size': 64,
-        'model.num_recycle': 1,
-    },
+      for root_kw in CONFIG_ALLATOM:
+        if root_kw not in config_diffs:
+          continue
+        cfg.merge_with(DictConfig({root_kw:config_diffs[root_kw]})) # merge to override
+        print(f'Updated config from `CONFIG_DIFFS`:{root_kw}: {config_diffs[root_kw]}')
+    
+    return cfg
+  
+  raise ValueError(f'Invalid config_diffs ({type(config_diffs)}): {config_diffs}')
+    
+
+# preset for runs
+CONFIG_DIFFS: dict[str, list[str]] = {
+    'allatom_demo': [
+      'model.heads.confidence_head.weight=0.01'
+      ],
+    'allatom_subbatch_64_recycle_1': [
+        'model.global_config.subbatch_size=64',
+        'model.num_recycle=1',
+    ]
 }
 
-CONFIG_ALLATOM = ml_collections.ConfigDict({
+CONFIG_ALLATOM = DictConfig({
   'data': {   
     'num_blocks': 5,    # for msa block deletion
     'randomize_num_blocks': True,
diff --git a/apps/protein_folding/helixfold3/requirements.txt b/apps/protein_folding/helixfold3/requirements.txt
deleted file mode 100644
index 660e43c1..00000000
--- a/apps/protein_folding/helixfold3/requirements.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-absl-py==0.13.0
-biopython==1.79
-chex==0.0.7
-dm-haiku==0.0.4
-dm-tree==0.1.6
-docker==5.0.0
-immutabledict==2.0.0
-jax==0.2.14
-ml-collections==0.1.0
-pandas==1.3.4
-scipy==1.9.0
-rdkit-pypi==2022.9.5 
-posebusters
\ No newline at end of file
diff --git a/apps/protein_folding/helixfold3/run_infer.sh b/apps/protein_folding/helixfold3/run_infer.sh
deleted file mode 100644
index 5b0644e5..00000000
--- a/apps/protein_folding/helixfold3/run_infer.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/bin/bash
-
-PYTHON_BIN="/usr/bin/python3" # changes to your python
-ENV_BIN="/root/miniconda3/bin"  # change to your env
-MAXIT_SRC="PATH/TO/MAXIT/SRC" # changes to your MAXIT
-export OBABEL_BIN="PATH/TO/OBABEL/BIN" # changes to your openbabel
-DATA_DIR="./data"
-export PATH="$MAXIT_SRC/bin:$PATH"
-
-CUDA_VISIBLE_DEVICES=0 "$PYTHON_BIN" inference.py \
-    --maxit_binary "$MAXIT_SRC/bin/maxit" \
-    --jackhmmer_binary_path "$ENV_BIN/jackhmmer" \
-	--hhblits_binary_path "$ENV_BIN/hhblits" \
-	--hhsearch_binary_path "$ENV_BIN/hhsearch" \
-	--kalign_binary_path "$ENV_BIN/kalign" \
-	--hmmsearch_binary_path "$ENV_BIN/hmmsearch" \
-	--hmmbuild_binary_path "$ENV_BIN/hmmbuild" \
-    --nhmmer_binary_path "$ENV_BIN/nhmmer" \
-    --preset='reduced_dbs' \
-    --bfd_database_path "$DATA_DIR/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt" \
-    --small_bfd_database_path "$DATA_DIR/small_bfd/bfd-first_non_consensus_sequences.fasta" \
-    --bfd_database_path "$DATA_DIR/small_bfd/bfd-first_non_consensus_sequences.fasta" \
-    --uniclust30_database_path "$DATA_DIR/uniclust30/uniclust30_2018_08/uniclust30_2018_08" \
-    --uniprot_database_path "$DATA_DIR/uniprot/uniprot.fasta" \
-    --pdb_seqres_database_path "$DATA_DIR/pdb_seqres/pdb_seqres.txt" \
-    --uniref90_database_path "$DATA_DIR/uniref90/uniref90.fasta" \
-    --mgnify_database_path "$DATA_DIR/mgnify/mgy_clusters_2018_12.fa" \
-    --template_mmcif_dir "$DATA_DIR/pdb_mmcif/mmcif_files" \
-    --obsolete_pdbs_path "$DATA_DIR/pdb_mmcif/obsolete.dat" \
-    --ccd_preprocessed_path "$DATA_DIR/ccd_preprocessed_etkdg.pkl.gz" \
-    --rfam_database_path "$DATA_DIR/Rfam-14.9_rep_seq.fasta" \
-    --max_template_date=2020-05-14 \
-    --input_json data/demo_6zcy.json \
-    --output_dir ./output \
-    --model_name allatom_demo \
-    --init_model init_models/HelixFold3-240814.pdparams \
-    --infer_times 1 \
-    --diff_batch_size 1 \
-    --precision "fp32"
\ No newline at end of file
diff --git a/apps/protein_folding/helixfold3/setup_env.sh b/apps/protein_folding/helixfold3/setup_env.sh
deleted file mode 100644
index 30f008d6..00000000
--- a/apps/protein_folding/helixfold3/setup_env.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-
-ENV_NAME='helixfold'
-CUDA=12.0
-
-# follow https://developer.nvidia.com/cuda-downloads to install cuda and cudatoolkit
-
-# Install py env
-conda create -n ${ENV_NAME} -y -c conda-forge  pip  python=3.9;
-source activate ${ENV_NAME}
-conda install -y cudnn=8.4.1 cudatoolkit=11.7 nccl=2.14.3 -c conda-forge -c nvidia
-
-conda install -y -c bioconda hmmer==3.3.2 kalign2==2.04 hhsuite==3.3.0 
-conda install -y -c conda-forge openbabel
-
-python -m pip install --upgrade 'pip<24';pip install .  --no-cache-dir
-
-pip install https://paddle-wheel.bj.bcebos.com/2.5.1/linux/linux-gpu-cuda11.7-cudnn8.4.1-mkl-gcc8.2-avx/paddlepaddle_gpu-2.5.1.post117-cp39-cp39-linux_x86_64.whl

From fbb2e05a7ed8590e793473bc6659eb7996ca774d Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Mon, 26 Aug 2024 14:27:45 +0800
Subject: [PATCH 5/6] chore: cpu only for msa only

---
 .../helixfold3/helixfold/inference.py                | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py
index f67e18a5..d176d057 100644
--- a/apps/protein_folding/helixfold3/helixfold/inference.py
+++ b/apps/protein_folding/helixfold3/helixfold/inference.py
@@ -454,6 +454,12 @@ def split_prediction(pred, rank):
 def main(cfg: DictConfig):
     set_logging_level(cfg.logging_level)
 
+    if cfg.msa_only == True:
+        logging.warning(f'Model inference will be skipped because MSA-only mode is required.')
+        logging.warning(f'Use CPU only')
+        paddle.device.set_device("cpu")
+        
+
     """main function"""
     new_einsum = os.getenv("FLAGS_new_einsum", True)
     print(f'>>> PaddlePaddle commit: {paddle.version.commit}')
@@ -505,6 +511,8 @@ def main(cfg: DictConfig):
             model.helixfold.set_state_dict(pd_params['model'])
         else:
             model.helixfold.set_state_dict(pd_params)
+
+    
     
     if cfg.precision == "bf16" and cfg.amp_level == "O2":
         raise NotImplementedError("bf16 O2 is not supported yet.")
@@ -531,6 +539,10 @@ def main(cfg: DictConfig):
         with open(features_pkl, 'wb') as f:
             pickle.dump(feature_dict, f, protocol=4)
 
+    if cfg.msa_only == True:
+        logging.warning(f'Model inference is skipped because MSA-only mode is required.')
+        exit()
+
     feature_dict['feat'] = batch_convert(feature_dict['feat'], add_batch=True)
     feature_dict['label'] = batch_convert(feature_dict['label'], add_batch=True)
     

From 62dbaa4c74c0b7c6025f3fbed7e0fdf312a556cf Mon Sep 17 00:00:00 2001
From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com>
Date: Sat, 17 Aug 2024 16:05:58 +0800
Subject: [PATCH 6/6] fix: maxit run with env

fix: maxit path

fix: maxit run with env
---
 .../helixfold/common/all_atom_pdb_save.py     | 29 ++++++++++++++++---
 .../helixfold/config/helixfold.yaml           |  1 -
 .../helixfold3/helixfold/inference.py         | 28 +++++++++---------
 3 files changed, 38 insertions(+), 20 deletions(-)

diff --git a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
index 9c9f288e..92e7d225 100644
--- a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
+++ b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
@@ -21,6 +21,7 @@
 import paddle
 import itertools
 import os
+import subprocess
 
 FeatureDict = Mapping[str, np.ndarray]
 ModelOutput = Mapping[str, Any]  # Is a nested dict.
@@ -164,7 +165,7 @@ def prediction_to_mmcif(pred_atom_pos: Union[np.ndarray, paddle.Tensor],
     - maxit_binary: path to maxit_binary, use to convert pdb to cif
     - mmcif_path: path to save *.cif
   """
-  if os.path.isfile(maxit_binary):
+  if not os.path.isfile(maxit_binary):
     raise FileNotFoundError(
       f'maxit_binary: {maxit_binary} not exists. '
       f'link: https://sw-tools.rcsb.org/apps/MAXIT/source.html')
@@ -174,7 +175,27 @@ def prediction_to_mmcif(pred_atom_pos: Union[np.ndarray, paddle.Tensor],
 
   pdb_path = mmcif_path.replace('.cif', '.pdb')
   pdb_path = prediction_to_pdb(pred_atom_pos, FeatsDict, pdb_path)
-  msg = os.system(f'{maxit_binary} -i {pdb_path} -o 1 -output {mmcif_path}')
-  if msg != 0:
-    print(f'convert pdb to cif failed, error message: {msg}')
+
+  cmd=[maxit_binary,
+       '-i', pdb_path,
+       '-o', '1',
+       '-output', mmcif_path,
+       ]
+  
+  print('Launching subprocess "%s"', ' '.join(cmd))
+
+  process = subprocess.Popen(
+      cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=os.environ.copy())
+
+
+  stdout, stderr = process.communicate()
+  retcode = process.wait()
+
+
+  if retcode:
+    # maxit exited with a non-zero status: surface its captured stdout/stderr in the raised error.
+    print('maxit failed. HHblits stderr begin:')
+    raise RuntimeError('HHblits failed\nstdout:\n%s\n\nstderr:\n%s\n' % (
+        stdout.decode('utf-8'), stderr[:500_000].decode('utf-8')))
+
   return mmcif_path
\ No newline at end of file
diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
index 4900fdca..047ee386 100644
--- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
+++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml
@@ -6,7 +6,6 @@ defaults:
 bf16_infer: false  # Corresponds to --bf16_infer
 seed: null  # Corresponds to --seed
 logging_level: DEBUG  # Corresponds to --logging_level
-job_id: 'structure_prediction'  # Corresponds to --model_name
 weight_path: /mnt/db/weights/helixfold/HelixFold3-params-240814/HelixFold3-240814.pdparams  # Corresponds to --init_model
 precision: fp32  # Corresponds to --precision
 amp_level: O1  # Corresponds to --amp_level
diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py
index d176d057..0531ead7 100644
--- a/apps/protein_folding/helixfold3/helixfold/inference.py
+++ b/apps/protein_folding/helixfold3/helixfold/inference.py
@@ -194,29 +194,27 @@ def ranking_all_predictions(output_dirs):
         rank_id += 1
 
 @paddle.no_grad()
-def eval(cfg: DictConfig, model:RunModel, batch):
-    """Evaluate a given dataset"""
+def eval(args, model, batch):
+    """evaluate a given dataset"""
     model.eval()       
         
-    # Inference
+    # inference
     def _forward_with_precision(batch):
-        precision=cfg.precision
-        if precision not in ('bf16','fp32',):
-            raise ValueError("Please choose precision from bf16 and fp32!")
-
-        if cfg.precision == "bf16" or cfg.bf16_infer:
+        if args.precision == "bf16" or args.bf16_infer:
             black_list, white_list = get_custom_amp_list()
             with paddle.amp.auto_cast(enable=True,
-                                      custom_white_list=white_list, 
-                                      custom_black_list=black_list, 
-                                      level=cfg.amp_level, 
-                                      dtype='bfloat16'):
+                                        custom_white_list=white_list, 
+                                        custom_black_list=black_list, 
+                                        level=args.amp_level, 
+                                        dtype='bfloat16'):
                 return model(batch, compute_loss=False)
-
-        return model(batch, compute_loss=False)
+        elif args.precision == "fp32":
+            return model(batch, compute_loss=False)
+        else:
+            raise ValueError("Please choose precision from bf16 and fp32! ")
         
     res = _forward_with_precision(batch)
-    logger.info("Inference Succeeds...\n")
+    logger.info(f"Inference Succeeds...\n")
     return res