From 6346752a654ece5b53cff8683c28b8c33b484a40 Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Fri, 16 Aug 2024 15:34:27 +0800 Subject: [PATCH 1/6] chore: hydra and pip --- .../helixfold/common/all_atom_pdb_save.py | 7 +- .../helixfold/config/helixfold.yaml | 59 ++++ .../infer_scripts/feature_processing_aa.py | 0 .../infer_scripts/preprocess.py | 13 +- .../infer_scripts/tools/mmcif_writer.py | 0 .../helixfold3/{ => helixfold}/inference.py | 289 +++++++----------- .../{ => helixfold}/utils/__init__.py | 0 .../helixfold3/{ => helixfold}/utils/misc.py | 0 .../helixfold3/{ => helixfold}/utils/model.py | 0 .../helixfold3/{ => helixfold}/utils/utils.py | 0 .../protein_folding/helixfold3/pyproject.toml | 48 +++ apps/protein_folding/helixfold3/setup_env.sh | 18 ++ 12 files changed, 248 insertions(+), 186 deletions(-) create mode 100644 apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml rename apps/protein_folding/helixfold3/{ => helixfold}/infer_scripts/feature_processing_aa.py (100%) rename apps/protein_folding/helixfold3/{ => helixfold}/infer_scripts/preprocess.py (97%) rename apps/protein_folding/helixfold3/{ => helixfold}/infer_scripts/tools/mmcif_writer.py (100%) rename apps/protein_folding/helixfold3/{ => helixfold}/inference.py (64%) rename apps/protein_folding/helixfold3/{ => helixfold}/utils/__init__.py (100%) rename apps/protein_folding/helixfold3/{ => helixfold}/utils/misc.py (100%) rename apps/protein_folding/helixfold3/{ => helixfold}/utils/model.py (100%) rename apps/protein_folding/helixfold3/{ => helixfold}/utils/utils.py (100%) create mode 100644 apps/protein_folding/helixfold3/pyproject.toml create mode 100644 apps/protein_folding/helixfold3/setup_env.sh diff --git a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py index deb8e087..9c9f288e 100644 --- 
a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py +++ b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py @@ -164,10 +164,13 @@ def prediction_to_mmcif(pred_atom_pos: Union[np.ndarray, paddle.Tensor], - maxit_binary: path to maxit_binary, use to convert pdb to cif - mmcif_path: path to save *.cif """ - assert maxit_binary is not None and os.path.exists(maxit_binary), ( + if os.path.isfile(maxit_binary): + raise FileNotFoundError( f'maxit_binary: {maxit_binary} not exists. ' f'link: https://sw-tools.rcsb.org/apps/MAXIT/source.html') - assert mmcif_path.endswith('.cif'), f'mmcif_path should endswith .cif; got {mmcif_path}' + + if not mmcif_path.endswith('.cif'): + raise ValueError(f'mmcif_path should endswith .cif; got {mmcif_path}') pdb_path = mmcif_path.replace('.cif', '.pdb') pdb_path = prediction_to_pdb(pred_atom_pos, FeatsDict, pdb_path) diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml new file mode 100644 index 00000000..fd70ada0 --- /dev/null +++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml @@ -0,0 +1,59 @@ +defaults: + - _self_ + +# General configuration + +bf16_infer: false # Corresponds to --bf16_infer +seed: null # Corresponds to --seed +logging_level: DEBUG # Corresponds to --logging_level +job_id: 'structure_prediction' # Corresponds to --model_name +weight_path: /mnt/db/weights/helixfold/HelixFold3-params-240814/HelixFold3-240814.pdparams # Corresponds to --init_model +precision: fp32 # Corresponds to --precision +amp_level: O1 # Corresponds to --amp_level +infer_times: 1 # Corresponds to --infer_times +diff_batch_size: -1 # Corresponds to --diff_batch_size +use_small_bfd: false # Corresponds to --use_small_bfd + +# File paths + +input: null # Corresponds to --input_json, required field +output: null # Corresponds to --output_dir, required field + + +# Binary tool paths, leave them as null to find 
proper ones under PATH or conda bin path +bin: + jackhmmer: null # Corresponds to --jackhmmer_binary_path + hhblits: null # Corresponds to --hhblits_binary_path + hhsearch: null # Corresponds to --hhsearch_binary_path + kalign: null # Corresponds to --kalign_binary_path + hmmsearch: null # Corresponds to --hmmsearch_binary_path + hmmbuild: null # Corresponds to --hmmbuild_binary_path + nhmmer: null # Corresponds to --nhmmer_binary_path + obabel: null + +# Database paths +db: + uniprot: /mnt/db/uniprot/uniprot.fasta # Corresponds to --uniprot_database_path, required field + pdb_seqres: /mnt/db/pdb_seqres/pdb_seqres.txt # Corresponds to --pdb_seqres_database_path, required field + uniref90: /mnt/db/uniref90/uniref90.fasta # Corresponds to --uniref90_database_path, required field + mgnify: /mnt/db/mgnify/mgy_clusters.fa # Corresponds to --mgnify_database_path, required field + bfd: /mnt/db/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt # Corresponds to --bfd_database_path + small_bfd: null # Corresponds to --small_bfd_database_path + uniclust30: /mnt/db/uniref30_uc30/UniRef30_2022_02/UniRef30_2022_02 # Corresponds to --uniclust30_database_path + rfam: /mnt/db/helixfold/rna/Rfam-14.9_rep_seq.fasta # Corresponds to --rfam_database_path, required field + ccd_preprocessed: /mnt/db/ccd/ccd_preprocessed_etkdg.pkl.gz # Corresponds to --ccd_preprocessed_path, required field + +# Template and PDB information +template: + mmcif_dir: /mnt/db/pdb_mmcif/mmcif_files # Corresponds to --template_mmcif_dir, required field + max_date: '2023-03-15' # Corresponds to --max_template_date, required field + obsolete_pdbs: /mnt/db/pdb_mmcif/obsolete.dat # Corresponds to --obsolete_pdbs_path, required field + +# Preset configuration +preset: + preset: full_dbs # Corresponds to --preset, choices=['reduced_dbs', 'full_dbs'] + +# Other configurations +other: + maxit_binary: /mnt/data/yinying/software/maxit/maxit-v11.100-prod-src/bin/maxit # Corresponds to --maxit_binary + 
no_msa_templ_feats: false # Corresponds to --no_msa_templ_feats diff --git a/apps/protein_folding/helixfold3/infer_scripts/feature_processing_aa.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py similarity index 100% rename from apps/protein_folding/helixfold3/infer_scripts/feature_processing_aa.py rename to apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py diff --git a/apps/protein_folding/helixfold3/infer_scripts/preprocess.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py similarity index 97% rename from apps/protein_folding/helixfold3/infer_scripts/preprocess.py rename to apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py index 41cd44ac..eb8eb14f 100644 --- a/apps/protein_folding/helixfold3/infer_scripts/preprocess.py +++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py @@ -5,17 +5,17 @@ 'seqs': ccd_seqs, 'msa_seqs': msa_seqs, 'count': count, - 'extra_mol_infos': {}, for which seqs has the modify residue type or smiles. + 'extra_mol_infos': {}, for which seqs has the modify residue type or smiles. """ import collections import copy +import gzip import os import json import sys import subprocess import tempfile import itertools -sys.path.append('../') import rdkit from rdkit import Chem from rdkit.Chem import AllChem @@ -52,9 +52,7 @@ 3: 'Unknown error.' 
} -OBABEL_BIN = os.getenv('OBABEL_BIN') -if not os.path.exists(OBABEL_BIN): - raise FileNotFoundError(f'Cannot find obabel binary at {OBABEL_BIN}.') + def read_json(path): @@ -144,6 +142,11 @@ def smiles_toMol_obabel(smiles): """ generate mol from smiles using obabel; """ + + OBABEL_BIN = os.getenv('OBABEL_BIN') + if not (OBABEL_BIN and os.path.isfile(OBABEL_BIN)): + raise FileNotFoundError(f'Cannot find obabel binary at {OBABEL_BIN}.') + with tempfile.NamedTemporaryFile(suffix=".mol2") as temp_file: print(f"[OBABEL] Temporary file created: {temp_file.name}") obabel_cmd = f"{OBABEL_BIN} -:'{smiles}' -omol2 -O{temp_file.name} --gen3d" diff --git a/apps/protein_folding/helixfold3/infer_scripts/tools/mmcif_writer.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/tools/mmcif_writer.py similarity index 100% rename from apps/protein_folding/helixfold3/infer_scripts/tools/mmcif_writer.py rename to apps/protein_folding/helixfold3/helixfold/infer_scripts/tools/mmcif_writer.py diff --git a/apps/protein_folding/helixfold3/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py similarity index 64% rename from apps/protein_folding/helixfold3/inference.py rename to apps/protein_folding/helixfold3/helixfold/inference.py index 51cf6ec6..b3fbf745 100644 --- a/apps/protein_folding/helixfold3/inference.py +++ b/apps/protein_folding/helixfold3/helixfold/inference.py @@ -16,7 +16,6 @@ import re import os import copy -import argparse import random import paddle import json @@ -25,6 +24,11 @@ import shutil import logging import numpy as np +import shutil + +from omegaconf import DictConfig +import hydra + from helixfold.common import all_atom_pdb_save from helixfold.model import config, utils from helixfold.data import pipeline_parallel as pipeline @@ -34,12 +38,14 @@ from helixfold.data.utils import atom_level_keys, map_to_continuous_indices from helixfold.data.tools import hmmsearch from helixfold.data import templates -from utils.utils import 
get_custom_amp_list -from utils.model import RunModel -from utils.misc import set_logging_level +from helixfold.utils.utils import get_custom_amp_list +from helixfold.utils.model import RunModel +from helixfold.utils.misc import set_logging_level from typing import Dict -from infer_scripts import feature_processing_aa, preprocess -from infer_scripts.tools import mmcif_writer +from helixfold.infer_scripts import feature_processing_aa, preprocess +from helixfold.infer_scripts.tools import mmcif_writer + +script_path=os.path.dirname(__file__) ALLOWED_LIGAND_BONDS_TYPE_MAP = preprocess.ALLOWED_LIGAND_BONDS_TYPE_MAP INVERSE_ALLOWED_LIGAND_BONDS_TYPE_MAP = { @@ -105,45 +111,57 @@ def convert_to_json_compatible(obj): return [convert_to_json_compatible(i) for i in obj] else: return obj - -def get_msa_templates_pipeline(args) -> Dict: - use_precomputed_msas = True # FLAGS.use_precomputed_msas + +def resolve_bin_path(cfg_path: str, default_binary_name: str)-> str: + """Helper function to resolve the binary path.""" + if cfg_path and os.path.isfile(cfg_path): + return cfg_path + + if cfg_val:=shutil.which(default_binary_name): + logging.warning(f'Using resolved {default_binary_name}: {cfg_val}') + return cfg_val + + raise FileNotFoundError(f"Could not find a proper binary path for {default_binary_name}: {cfg_path}.") + +def get_msa_templates_pipeline(cfg: DictConfig) -> Dict: + use_precomputed_msas = True # Assuming this is a constant or should be set globally + template_searcher = hmmsearch.Hmmsearch( - binary_path=args.hmmsearch_binary_path, - hmmbuild_binary_path=args.hmmbuild_binary_path, - database_path=args.pdb_seqres_database_path) + binary_path=resolve_bin_path(cfg.bin.hmmsearch, 'hmmsearch'), + hmmbuild_binary_path=resolve_bin_path(cfg.bin.hmmbuild, 'hmmbuild'), + database_path=cfg.db.pdb_seqres) template_featurizer = templates.HmmsearchHitFeaturizer( - mmcif_dir=args.template_mmcif_dir, - max_template_date=args.max_template_date, + mmcif_dir=cfg.template.mmcif_dir, 
+ max_template_date=cfg.template.max_date, max_hits=MAX_TEMPLATE_HITS, - kalign_binary_path=args.kalign_binary_path, + kalign_binary_path=resolve_bin_path(cfg.bin.kalign, 'kalign'), release_dates_path=None, - obsolete_pdbs_path=args.obsolete_pdbs_path) + obsolete_pdbs_path=cfg.template.obsolete_pdbs) monomer_data_pipeline = pipeline.DataPipeline( - jackhmmer_binary_path=args.jackhmmer_binary_path, - hhblits_binary_path=args.hhblits_binary_path, - hhsearch_binary_path=args.hhsearch_binary_path, - uniref90_database_path=args.uniref90_database_path, - mgnify_database_path=args.mgnify_database_path, - bfd_database_path=args.bfd_database_path, - uniclust30_database_path=args.uniclust30_database_path, - small_bfd_database_path=args.small_bfd_database_path , + jackhmmer_binary_path=resolve_bin_path(cfg.bin.jackhmmer, 'jackhmmer'), + hhblits_binary_path=resolve_bin_path(cfg.bin.hhblits, 'hhblits'), + hhsearch_binary_path=resolve_bin_path(cfg.bin.hhsearch, 'hhsearch'), + uniref90_database_path=cfg.db.uniref90, + mgnify_database_path=cfg.db.mgnify, + bfd_database_path=cfg.db.bfd, + uniclust30_database_path=cfg.db.uniclust30, + small_bfd_database_path=cfg.db.small_bfd, template_searcher=template_searcher, template_featurizer=template_featurizer, - use_small_bfd=args.use_small_bfd, + use_small_bfd=cfg.use_small_bfd, use_precomputed_msas=use_precomputed_msas) prot_data_pipeline = pipeline_multimer.DataPipeline( monomer_data_pipeline=monomer_data_pipeline, - jackhmmer_binary_path=args.jackhmmer_binary_path, - uniprot_database_path=args.uniprot_database_path, + jackhmmer_binary_path=resolve_bin_path(cfg.bin.jackhmmer, 'jackhmmer'), + uniprot_database_path=cfg.db.uniprot, use_precomputed_msas=use_precomputed_msas) rna_monomer_data_pipeline = pipeline_rna.RNADataPipeline( - hmmer_binary_path=args.nhmmer_binary_path, - rfam_database_path=args.rfam_database_path, + hmmer_binary_path=resolve_bin_path(cfg.bin.nhmmer, 'nhmmer'), + rfam_database_path=cfg.db.rfam, 
rnacentral_database_path=None, nt_database_path=None, species_identifer_map_path=None, @@ -156,7 +174,6 @@ def get_msa_templates_pipeline(args) -> Dict: 'protein': prot_data_pipeline, 'rna': rna_data_pipeline } - def ranking_all_predictions(output_dirs): ranking_score_path_map = {} for outpath in output_dirs: @@ -176,27 +193,29 @@ def ranking_all_predictions(output_dirs): rank_id += 1 @paddle.no_grad() -def eval(args, model, batch): - """evaluate a given dataset""" +def eval(cfg: DictConfig, model:RunModel, batch): + """Evaluate a given dataset""" model.eval() - # inference + # Inference def _forward_with_precision(batch): - if args.precision == "bf16" or args.bf16_infer: + precision=cfg.precision + if precision not in ('bf16','fp32',): + raise ValueError("Please choose precision from bf16 and fp32!") + + if cfg.precision == "bf16" or cfg.bf16_infer: black_list, white_list = get_custom_amp_list() with paddle.amp.auto_cast(enable=True, - custom_white_list=white_list, - custom_black_list=black_list, - level=args.amp_level, - dtype='bfloat16'): + custom_white_list=white_list, + custom_black_list=black_list, + level=cfg.amp_level, + dtype='bfloat16'): return model(batch, compute_loss=False) - elif args.precision == "fp32": - return model(batch, compute_loss=False) - else: - raise ValueError("Please choose precision from bf16 and fp32! 
") + + return model(batch, compute_loss=False) res = _forward_with_precision(batch) - logger.info(f"Inference Succeeds...\n") + logger.info("Inference Succeeds...\n") return res @@ -430,52 +449,55 @@ def split_prediction(pred, rank): return prediction -def main(args): - set_logging_level(args.logging_level) +@hydra.main(version_base=None, config_path=os.path.join(script_path,'config',),config_name='helixfold') +def main(cfg: DictConfig): + set_logging_level(cfg.logging_level) """main function""" new_einsum = os.getenv("FLAGS_new_einsum", True) print(f'>>> PaddlePaddle commit: {paddle.version.commit}') print(f'>>> FLAGS_new_einsum: {new_einsum}') - print(f'>>> args:\n{args}') + print(f'>>> config:\n{cfg}') - all_entitys = preprocess_json_entity(args.input_json, args.output_dir) + all_entitys = preprocess_json_entity(cfg.input, cfg.output) ## check maxit binary path - if args.maxit_binary is not None: - assert os.path.exists(args.maxit_binary), \ - f"The maxit binary path {args.maxit_binary} does not exists." 
+ maxit_binary=resolve_bin_path(cfg.other.maxit_binary,'maxit') + + RCSBROOT=os.path.dirname(maxit_binary) + os.environ['RCSBROOT']=RCSBROOT + ## check obabel + obabel_bin=resolve_bin_path(cfg.bin.obabel,'obabel') + os.environ['OBABEL_BIN']=obabel_bin - ### set seed for reproduce experiment results - seed = args.seed + ### Set seed for reproducibility + seed = cfg.seed if seed is None: seed = np.random.randint(10000000) else: - logger.warning('seed is only used for reproduction') + logger.warning('Seed is only used for reproduction') init_seed(seed) - - use_small_bfd = args.preset == 'reduced_dbs' - setattr(args, 'use_small_bfd', use_small_bfd) + use_small_bfd = cfg.preset.preset == 'reduced_dbs' + setattr(cfg, 'use_small_bfd', use_small_bfd) if use_small_bfd: - assert args.small_bfd_database_path is not None + assert cfg.db.small_bfd is not None else: - assert args.bfd_database_path is not None - assert args.uniclust30_database_path is not None + assert cfg.db.bfd is not None + assert cfg.db.uniclust30 is not None logger.info('Getting MSA/Template Pipelines...') msa_templ_data_pipeline_dict = get_msa_templates_pipeline(args) - - ### create model - model_config = config.model_config(args.model_name) - print(f'>>> model_config:\n{model_config}') + ### Create model + model_config = config.model_config(cfg.job_id) + #print(f'>>> model_config:\n{model_config}') model = RunModel(model_config) - if (not args.init_model is None) and (not args.init_model == ""): - print(f"Load pretrain model from {args.init_model}") - pd_params = paddle.load(args.init_model) + if (not cfg.weight_path is None) and (cfg.weight_path != ""): + print(f"Load pretrain model from {cfg.weight_path}") + pd_params = paddle.load(cfg.weight_path) has_opt = 'optimizer' in pd_params if has_opt: @@ -483,42 +505,46 @@ def main(args): else: model.helixfold.set_state_dict(pd_params) - if args.precision == "bf16" and args.amp_level == "O2": + if cfg.precision == "bf16" and cfg.amp_level == "O2": raise 
NotImplementedError("bf16 O2 is not supported yet.") print(f"============ Data Loading ============") - job_base = pathlib.Path(args.input_json).stem - output_dir_base = pathlib.Path(args.output_dir).joinpath(job_base) + job_base = pathlib.Path(cfg.input).stem + output_dir_base = pathlib.Path(cfg.output).joinpath(job_base) msa_output_dir = output_dir_base.joinpath('msas') msa_output_dir.mkdir(parents=True, exist_ok=True) features_pkl = output_dir_base.joinpath('final_features.pkl') - feature_dict = feature_processing_aa.process_input_json( - all_entitys, - ccd_preprocessed_path=args.ccd_preprocessed_path, - msa_templ_data_pipeline_dict=msa_templ_data_pipeline_dict, - msa_output_dir=msa_output_dir) + if features_pkl.exists(): + with open(features_pkl, 'rb') as f: + feature_dict = pickle.load(f) + else: + feature_dict = feature_processing_aa.process_input_json( + all_entitys, + ccd_preprocessed_path=cfg.db.ccd_preprocessed, + msa_templ_data_pipeline_dict=msa_templ_data_pipeline_dict, + msa_output_dir=msa_output_dir) - # save features - with open(features_pkl, 'wb') as f: - pickle.dump(feature_dict, f, protocol=4) + # save features + with open(features_pkl, 'wb') as f: + pickle.dump(feature_dict, f, protocol=4) feature_dict['feat'] = batch_convert(feature_dict['feat'], add_batch=True) feature_dict['label'] = batch_convert(feature_dict['label'], add_batch=True) print(f"============ Start Inference ============") - infer_times = args.infer_times - if args.diff_batch_size > 0: - model_config.model.heads.diffusion_module.test_diff_batch_size = args.diff_batch_size + infer_times = cfg.infer_times + if cfg.diff_batch_size > 0: + model_config.model.heads.diffusion_module.test_diff_batch_size = cfg.diff_batch_size diff_batch_size = model_config.model.heads.diffusion_module.test_diff_batch_size logger.info(f'Inference {infer_times} Times...') - logger.info(f" diffusion batch size {diff_batch_size}...\n") + logger.info(f"Diffusion batch size {diff_batch_size}...\n") 
all_pred_path = [] for infer_id in range(infer_times): logger.info(f'Start {infer_id}-th inference...\n') - prediction = eval(args, model, feature_dict) + prediction = eval(cfg, model, feature_dict) # save result prediction = split_prediction(prediction, diff_batch_size) @@ -530,7 +556,7 @@ def main(args): feature_dict=feature_dict, prediction=prediction[rank_id], output_dir=output_dir, - maxit_bin=args.maxit_binary) + maxit_bin=cfg.other.maxit_binary) all_pred_path.append(output_dir) # final ranking @@ -538,100 +564,5 @@ def main(args): ranking_all_predictions(all_pred_path) print(f'============ Inference finished ! ============') - if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument("--bf16_infer", action='store_true', default=False) - parser.add_argument("--seed", type=int, default=None, help="set seed for reproduce experiment results, None is do not set seed") - parser.add_argument("--logging_level", type=str, default="DEBUG", help="NOTSET, DEBUG, INFO, WARNING, ERROR, CRITICAL") - parser.add_argument("--model_name", type=str, help='used to choose model config') - parser.add_argument("--init_model", type=str, default='') - parser.add_argument("--precision", type=str, choices=['fp32', 'bf16'], default='fp32') - parser.add_argument("--amp_level", type=str, default='O1') - parser.add_argument("--infer_times", type=int, default=1) - parser.add_argument("--diff_batch_size", type=int, default=-1) - parser.add_argument('--input_json', type=str, - default=None, required=True, - help='Paths to json file, each containing ' - 'entity information including sequence, smiles or CCD, copies etc.') - parser.add_argument('--output_dir', type=str, - default=None, required=True, - help='Path to a directory that will store results.') - parser.add_argument('--ccd_preprocessed_path', type=str, - default=None, required=True, - help='Path to CCD preprocessed files.') - parser.add_argument('--jackhmmer_binary_path', type=str, - 
default='/usr/bin/jackhmmer', - help='Path to the JackHMMER executable.') - parser.add_argument('--hhblits_binary_path', type=str, - default='/usr/bin/hhblits', - help='Path to the HHblits executable.') - parser.add_argument('--hhsearch_binary_path', type=str, - default='/usr/bin/hhsearch', - help='Path to the HHsearch executable.') - parser.add_argument('--kalign_binary_path', type=str, - default='/usr/bin/kalign', - help='Path to the Kalign executable.') - parser.add_argument('--hmmsearch_binary_path', type=str, - default='/usr/bin/hmmsearch', - help='Path to the hmmsearch executable.') - parser.add_argument('--hmmbuild_binary_path', type=str, - default='/usr/bin/hmmbuild', - help='Path to the hmmbuild executable.') - - # binary path of the tool for RNA MSA searching - parser.add_argument('--nhmmer_binary_path', type=str, - default='/usr/bin/nhmmer', - help='Path to the nhmmer executable.') - - parser.add_argument('--uniprot_database_path', type=str, - default=None, required=True, - help='Path to the Uniprot database for use ' - 'by JackHMMER.') - parser.add_argument('--pdb_seqres_database_path', type=str, - default=None, required=True, - help='Path to the PDB ' - 'seqres database for use by hmmsearch.') - parser.add_argument('--uniref90_database_path', type=str, - default=None, required=True, - help='Path to the Uniref90 database for use ' - 'by JackHMMER.') - parser.add_argument('--mgnify_database_path', type=str, - default=None, required=True, - help='Path to the MGnify database for use by ' - 'JackHMMER.') - parser.add_argument('--bfd_database_path', type=str, default=None, - help='Path to the BFD database for use by HHblits.') - parser.add_argument('--small_bfd_database_path', type=str, default=None, - help='Path to the small version of BFD used ' - 'with the "reduced_dbs" preset.') - parser.add_argument('--uniclust30_database_path', type=str, default=None, - help='Path to the Uniclust30 database for use ' - 'by HHblits.') - # RNA MSA searching databases - 
parser.add_argument('--rfam_database_path', type=str, - default=None, required=True, - help='Path to the Rfam database for RNA MSA searching.') - parser.add_argument('--template_mmcif_dir', type=str, - default=None, required=True, - help='Path to a directory with template mmCIF ' - 'structures, each named .cif') - parser.add_argument('--max_template_date', type=str, - default=None, required=True, - help='Maximum template release date to consider. ' - 'Important if folding historical test sets.') - parser.add_argument('--obsolete_pdbs_path', type=str, - default=None, required=True, - help='Path to file containing a mapping from ' - 'obsolete PDB IDs to the PDB IDs of their ' - 'replacements.') - parser.add_argument('--preset', - default='full_dbs', required=False, - choices=['reduced_dbs', 'full_dbs'], - help='Choose preset model configuration - ' - 'no ensembling and smaller genetic database ' - 'config (reduced_dbs), no ensembling and full ' - 'genetic database config (full_dbs)') - parser.add_argument('--maxit_binary', type=str, default=None) - args = parser.parse_args() - main(args) + main() diff --git a/apps/protein_folding/helixfold3/utils/__init__.py b/apps/protein_folding/helixfold3/helixfold/utils/__init__.py similarity index 100% rename from apps/protein_folding/helixfold3/utils/__init__.py rename to apps/protein_folding/helixfold3/helixfold/utils/__init__.py diff --git a/apps/protein_folding/helixfold3/utils/misc.py b/apps/protein_folding/helixfold3/helixfold/utils/misc.py similarity index 100% rename from apps/protein_folding/helixfold3/utils/misc.py rename to apps/protein_folding/helixfold3/helixfold/utils/misc.py diff --git a/apps/protein_folding/helixfold3/utils/model.py b/apps/protein_folding/helixfold3/helixfold/utils/model.py similarity index 100% rename from apps/protein_folding/helixfold3/utils/model.py rename to apps/protein_folding/helixfold3/helixfold/utils/model.py diff --git a/apps/protein_folding/helixfold3/utils/utils.py 
b/apps/protein_folding/helixfold3/helixfold/utils/utils.py similarity index 100% rename from apps/protein_folding/helixfold3/utils/utils.py rename to apps/protein_folding/helixfold3/helixfold/utils/utils.py diff --git a/apps/protein_folding/helixfold3/pyproject.toml b/apps/protein_folding/helixfold3/pyproject.toml new file mode 100644 index 00000000..bc988ca9 --- /dev/null +++ b/apps/protein_folding/helixfold3/pyproject.toml @@ -0,0 +1,48 @@ +[build-system] +requires = ["poetry-core>=1.0.0,<2.0.0"] +build-backend = "poetry.core.masonry.api" + +[tool.poetry] +name = "helixfold" +version = "3.0.0" +description = "Code for helixfold v3" +authors = ["Name "] + +readme = "README.md" +license = "MIT" +repository = "https://github.com/PaddlePaddle/PaddleHelix/blob/dev/apps/protein_folding/helixfold3" +classifiers = [ + "Topic :: Scientific/Engineering :: Biochemistry", + "Topic :: Scientific/Engineering :: Protein Engineering" +] + + +packages = [ + { include = "helixfold" }, + { include = "helixfold/*.py" }, +] + + +[tool.poetry.dependencies] +python = "^3.8" + +absl-py = "0.13.0" +biopython = "1.79" +chex = "0.0.7" +dm-haiku = "0.0.4" +dm-tree = "0.1.6" +docker = "5.0.0" +immutabledict = "2.0.0" +jax = "0.2.14" +ml-collections = "0.1.0" +pandas = "1.3.4" +scipy = "1.9.0" +rdkit-pypi = "2022.9.5" +posebusters = "*" +hydra-core= "^1.3.2" +omegaconf = "^2.3.0" + + + +[tool.poetry.scripts] +helixfold = 'helixfold.inference:main' diff --git a/apps/protein_folding/helixfold3/setup_env.sh b/apps/protein_folding/helixfold3/setup_env.sh new file mode 100644 index 00000000..30f008d6 --- /dev/null +++ b/apps/protein_folding/helixfold3/setup_env.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +ENV_NAME='helixfold' +CUDA=12.0 + +# follow https://developer.nvidia.com/cuda-downloads to install cuda and cudatoolkit + +# Install py env +conda create -n ${ENV_NAME} -y -c conda-forge pip python=3.9; +source activate ${ENV_NAME} +conda install -y cudnn=8.4.1 cudatoolkit=11.7 nccl=2.14.3 -c conda-forge 
-c nvidia + +conda install -y -c bioconda hmmer==3.3.2 kalign2==2.04 hhsuite==3.3.0 +conda install -y -c conda-forge openbabel + +python -m pip install --upgrade 'pip<24';pip install . --no-cache-dir + +pip install https://paddle-wheel.bj.bcebos.com/2.5.1/linux/linux-gpu-cuda11.7-cudnn8.4.1-mkl-gcc8.2-avx/paddlepaddle_gpu-2.5.1.post117-cp39-cp39-linux_x86_64.whl From a5925e0234e88bb8b57042e83402dc6b59cee3b3 Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Sat, 17 Aug 2024 10:52:13 +0800 Subject: [PATCH 2/6] use reduced bfd --- .../helixfold3/helixfold/config/helixfold.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml index fd70ada0..60e8076d 100644 --- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml +++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml @@ -29,7 +29,7 @@ bin: hmmsearch: null # Corresponds to --hmmsearch_binary_path hmmbuild: null # Corresponds to --hmmbuild_binary_path nhmmer: null # Corresponds to --nhmmer_binary_path - obabel: null + obabel: null # Inject to env as OBABEL_BIN # Database paths db: @@ -38,7 +38,7 @@ db: uniref90: /mnt/db/uniref90/uniref90.fasta # Corresponds to --uniref90_database_path, required field mgnify: /mnt/db/mgnify/mgy_clusters.fa # Corresponds to --mgnify_database_path, required field bfd: /mnt/db/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt # Corresponds to --bfd_database_path - small_bfd: null # Corresponds to --small_bfd_database_path + small_bfd: /mnt/db/reduced_bfd/bfd-first_non_consensus_sequences.fasta # Corresponds to --small_bfd_database_path uniclust30: /mnt/db/uniref30_uc30/UniRef30_2022_02/UniRef30_2022_02 # Corresponds to --uniclust30_database_path rfam: /mnt/db/helixfold/rna/Rfam-14.9_rep_seq.fasta # Corresponds to --rfam_database_path, required field 
ccd_preprocessed: /mnt/db/ccd/ccd_preprocessed_etkdg.pkl.gz # Corresponds to --ccd_preprocessed_path, required field @@ -51,7 +51,7 @@ template: # Preset configuration preset: - preset: full_dbs # Corresponds to --preset, choices=['reduced_dbs', 'full_dbs'] + preset: reduced_dbs # Corresponds to --preset, choices=['reduced_dbs', 'full_dbs'] # Other configurations other: From ab1648aad453e64203c69d538536ddc6af8d7bee Mon Sep 17 00:00:00 2001 From: Ryan Garcia <47666442+RyanGarciaLI@users.noreply.github.com> Date: Thu, 15 Aug 2024 21:20:39 +0800 Subject: [PATCH 3/6] disable no MSA mode (#312) From eb15ef707d162b80959530daeaaf681934b95486 Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Sat, 17 Aug 2024 11:37:57 +0800 Subject: [PATCH 4/6] feat: model config overriden add override flag fix: enable mutiple config overrides feat: model config overriden fix: imports doc: merge to readme fix:config add configs --- apps/protein_folding/helixfold3/README.md | 99 +++++++++---------- .../helixfold/config/helixfold.yaml | 17 +++- .../helixfold3/helixfold/inference.py | 12 ++- .../helixfold3/helixfold/model/config.py | 49 ++++++--- .../helixfold3/requirements.txt | 13 --- apps/protein_folding/helixfold3/run_infer.sh | 39 -------- apps/protein_folding/helixfold3/setup_env.sh | 18 ---- 7 files changed, 102 insertions(+), 145 deletions(-) delete mode 100644 apps/protein_folding/helixfold3/requirements.txt delete mode 100644 apps/protein_folding/helixfold3/run_infer.sh delete mode 100644 apps/protein_folding/helixfold3/setup_env.sh diff --git a/apps/protein_folding/helixfold3/README.md b/apps/protein_folding/helixfold3/README.md index 12ed7041..7c1ba474 100644 --- a/apps/protein_folding/helixfold3/README.md +++ b/apps/protein_folding/helixfold3/README.md @@ -44,17 +44,26 @@ Locate to the directory of `helixfold` then run: ```bash # Install py env conda create -n helixfold -c conda-forge python=3.9 -conda install -y -c bioconda aria2 
hmmer==3.3.2 kalign2==2.04 hhsuite==3.3.0 -n helixfold -conda install -y -c conda-forge openbabel -n helixfold # activate the conda environment conda activate helixfold +# adjust these version numbers as your situation +conda install -y cudnn=8.4.1 cudatoolkit=11.7 nccl=2.14.3 -c conda-forge -c nvidia +conda install -y -c bioconda hmmer==3.3.2 kalign2==2.04 hhsuite==3.3.0 +conda install -y -c conda-forge openbabel + # install paddlepaddle -python3 -m pip install paddlepaddle-gpu==2.6.1.post120 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html +pip install paddlepaddle-gpu==2.6.1.post120 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html # or lower version: https://paddle-wheel.bj.bcebos.com/2.5.1/linux/linux-gpu-cuda11.7-cudnn8.4.1-mkl-gcc8.2-avx/paddlepaddle_gpu-2.5.1.post117-cp39-cp39-linux_x86_64.whl -python3 -m pip install -r requirements.txt +# downgrade pip +pip install --upgrade 'pip<24' + +# edit configuration file at `/helixfold/config/helixfold.yaml` to set your databases and binaries correctly + +# install HF3 as a python library +pip install . --no-cache-dir ``` Note: If you have a different version of python3 and cuda, please refer to [here](https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html) for the compatible PaddlePaddle `dev` package. 
@@ -125,58 +134,40 @@ sh run_infer.sh ``` The script is as follows, -```bash -#!/bin/bash - -PYTHON_BIN="PATH/TO/YOUR/PYTHON" -ENV_BIN="PATH/TO/YOUR/ENV" -MAXIT_SRC="PATH/TO/MAXIT/SRC" -DATA_DIR="PATH/TO/DATA" -export OBABEL_BIN="PATH/TO/OBABEL/BIN" -export PATH="$MAXIT_BIN/bin:$PATH" - -CUDA_VISIBLE_DEVICES=0 "$PYTHON_BIN" inference.py \ - --maxit_binary "$MAXIT_SRC/bin/maxit" \ - --jackhmmer_binary_path "$ENV_BIN/jackhmmer" \ - --hhblits_binary_path "$ENV_BIN/hhblits" \ - --hhsearch_binary_path "$ENV_BIN/hhsearch" \ - --kalign_binary_path "$ENV_BIN/kalign" \ - --hmmsearch_binary_path "$ENV_BIN/hmmsearch" \ - --hmmbuild_binary_path "$ENV_BIN/hmmbuild" \ - --nhmmer_binary_path "$ENV_BIN/nhmmer" \ - --preset='reduced_dbs' \ - --bfd_database_path "$DATA_DIR/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt" \ - --small_bfd_database_path "$DATA_DIR/small_bfd/bfd-first_non_consensus_sequences.fasta" \ - --bfd_database_path "$DATA_DIR/small_bfd/bfd-first_non_consensus_sequences.fasta" \ - --uniclust30_database_path "$DATA_DIR/uniclust30/uniclust30_2018_08/uniclust30_2018_08" \ - --uniprot_database_path "$DATA_DIR/uniprot/uniprot.fasta" \ - --pdb_seqres_database_path "$DATA_DIR/pdb_seqres/pdb_seqres.txt" \ - --uniref90_database_path "$DATA_DIR/uniref90/uniref90.fasta" \ - --mgnify_database_path "$DATA_DIR/mgnify/mgy_clusters_2018_12.fa" \ - --template_mmcif_dir "$DATA_DIR/pdb_mmcif/mmcif_files" \ - --obsolete_pdbs_path "$DATA_DIR/pdb_mmcif/obsolete.dat" \ - --ccd_preprocessed_path "$DATA_DIR/ccd_preprocessed_etkdg.pkl.gz" \ - --rfam_database_path "$DATA_DIR/Rfam-14.9_rep_seq.fasta" \ - --max_template_date=2020-05-14 \ - --input_json data/demo_protein_ligand.json \ - --output_dir ./output \ - --model_name allatom_demo \ - --init_model ./init_models/checkpoints.pdparams \ - --infer_times 3 \ - --precision "fp32" +##### Run from default config +```shell +LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH \ +helixfold \ + 
input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_8ecx.json \ + output=. CONFIG_DIFFS.preset=allatom_demo ``` + +##### Run with customized configuration dir and file(`./myfold.yaml`, for example): +```shell +LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH \ +helixfold --config-dir=. --config-name=myfold \ + input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_6zcy_smiles.json \ + output=. CONFIG_DIFFS.preset=allatom_demo +``` + +##### Run with additional configuration term +```shell +LD_LIBRARY_PATH=/mnt/data/envs/conda_env/envs/helixfold/lib/:$LD_LIBRARY_PATH \ +helixfold \ + input=/repo/PaddleHelix/apps/protein_folding/helixfold3/data/demo_6zcy.json \ + output=. \ + CONFIG_DIFFS.model.heads.confidence_head.weight=0.01 \ + CONFIG_DIFFS.model.global_config.subbatch_size=192 +``` + The descriptions of the above script are as follows: -* Replace `MAXIT_SRC` with your installed `maxit`'s root path. -* Replace `DATA_DIR` with your downloaded data path. -* Replace `OBABEL_BIN` with your installed `openbabel` path. -* Replace `ENV_BIN` with your conda virtual environment or any environment where `hhblits`, `hmmsearch` and other dependencies have been installed. -* `--preset` - Set `'reduced_dbs'` to use small bfd or `'full_dbs'` to use full bfd. -* `--*_database_path` - Path to datasets you have downloaded. -* `--input_json` - Input data in the form of JSON. Input pattern in `./data/demo_*.json` for your reference. -* `--output_dir` - Model output path. The output will be in a folder named the same as your `--input_json` under this path. -* `--model_name` - Model name in `./helixfold/model/config.py`. Different model names specify different configurations. Mirro modification to configuration can be specified in `CONFIG_DIFFS` in the `config.py` without change to the full configuration in `CONFIG_ALLATOM`. -* `--infer_time` - The number of inferences executed by model for single input. 
In each inference, the model will infer `5` times (`diff_batch_size`) for the same input by default. This hyperparameter can be changed by `model.head.diffusion_module.test_diff_batch_size` within `./helixfold/model/config.py` -* `--precision` - Either `bf16` or `fp32`. Please check if your machine can support `bf16` or not beforing changing it. For example, `bf16` is supported by A100 and H100 or higher version while V100 only supports `fp32`. +* `LD_LIBRARY_PATH` - This is required to load the `libcudnn.so` library if you encounter an issue like `RuntimeError: (PreconditionNotMet) Cannot load cudnn shared library. Cannot invoke method cudnnGetVersion.` +* `config-dir` - The directory that contains the alternative configuration file you would like to use. +* `config-name` - The name of the configuration file you would like to use. +* `input` - Input data in the form of JSON. Input pattern in `./data/demo_*.json` for your reference. +* `output` - Model output path. The output will be in a folder named the same as your `input` JSON file under this path. +* `CONFIG_DIFFS.preset` - Preset name in `./helixfold/model/config.py`. Different preset names specify different configurations. Minor modifications to the configuration can be specified in `CONFIG_DIFFS` in `config.py` without changing the full configuration in `CONFIG_ALLATOM`. +* `CONFIG_DIFFS.*` - Override any model configuration in `CONFIG_ALLATOM`. 
### Understanding Model Output diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml index 60e8076d..4900fdca 100644 --- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml +++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml @@ -13,11 +13,13 @@ amp_level: O1 # Corresponds to --amp_level infer_times: 1 # Corresponds to --infer_times diff_batch_size: -1 # Corresponds to --diff_batch_size use_small_bfd: false # Corresponds to --use_small_bfd +msa_only: false # Set true to stop after MSA/feature processing and skip model inference # File paths input: null # Corresponds to --input_json, required field output: null # Corresponds to --output_dir, required field +override: false # Set true to ignore a cached final_features.pkl and recompute the input features # Binary tool paths, leave them as null to find proper ones under PATH or conda bin path @@ -55,5 +57,16 @@ preset: # Other configurations other: - maxit_binary: /mnt/data/yinying/software/maxit/maxit-v11.100-prod-src/bin/maxit # Corresponds to --maxit_binary - no_msa_templ_feats: false # Corresponds to --no_msa_templ_feats + maxit_binary: /mnt/data/software/maxit/maxit-v11.100-prod-src/bin/maxit # Corresponds to --maxit_binary + + +# CONFIG_DIFFS for advanced configuration +CONFIG_DIFFS: + preset: null # choices=['null', 'allatom_demo', 'allatom_subbatch_64_recycle_1'] + # model: + # global_config: + # subbatch_size: 96 # model.global_config.subbatch_size + # num_recycle: 3 # model.num_recycle + # heads: + # confidence_head: + # weight: 0.0 # model.heads.confidence_head.weight diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py index b3fbf745..f67e18a5 100644 --- a/apps/protein_folding/helixfold3/helixfold/inference.py +++ b/apps/protein_folding/helixfold3/helixfold/inference.py @@ -36,15 +36,16 @@ from helixfold.data import pipeline_rna_parallel as pipeline_rna from helixfold.data import pipeline_rna_multimer 
from helixfold.data.utils import atom_level_keys, map_to_continuous_indices +from helixfold.utils.model import RunModel from helixfold.data.tools import hmmsearch from helixfold.data import templates from helixfold.utils.utils import get_custom_amp_list -from helixfold.utils.model import RunModel from helixfold.utils.misc import set_logging_level from typing import Dict from helixfold.infer_scripts import feature_processing_aa, preprocess from helixfold.infer_scripts.tools import mmcif_writer + script_path=os.path.dirname(__file__) ALLOWED_LIGAND_BONDS_TYPE_MAP = preprocess.ALLOWED_LIGAND_BONDS_TYPE_MAP @@ -487,11 +488,11 @@ def main(cfg: DictConfig): assert cfg.db.uniclust30 is not None logger.info('Getting MSA/Template Pipelines...') - msa_templ_data_pipeline_dict = get_msa_templates_pipeline(args) + msa_templ_data_pipeline_dict = get_msa_templates_pipeline(cfg=cfg) ### Create model - model_config = config.model_config(cfg.job_id) - #print(f'>>> model_config:\n{model_config}') + model_config = config.model_config(cfg.CONFIG_DIFFS) + logging.warning(f'>>> Model config: \n{model_config}\n\n') model = RunModel(model_config) @@ -515,8 +516,9 @@ def main(cfg: DictConfig): msa_output_dir.mkdir(parents=True, exist_ok=True) features_pkl = output_dir_base.joinpath('final_features.pkl') - if features_pkl.exists(): + if features_pkl.exists() and not cfg.override: with open(features_pkl, 'rb') as f: + logging.info(f'Load features from precomputed {features_pkl}') feature_dict = pickle.load(f) else: feature_dict = feature_processing_aa.process_input_json( diff --git a/apps/protein_folding/helixfold3/helixfold/model/config.py b/apps/protein_folding/helixfold3/helixfold/model/config.py index 6da8566a..f9dbbf1d 100644 --- a/apps/protein_folding/helixfold3/helixfold/model/config.py +++ b/apps/protein_folding/helixfold3/helixfold/model/config.py @@ -15,7 +15,8 @@ """Model config.""" import copy -import ml_collections +from typing import Any, Union +from omegaconf import DictConfig 
NUM_RES = 'num residues placeholder' @@ -24,27 +25,47 @@ NUM_TEMPLATES = 'num templates placeholder' -def model_config(name: str) -> ml_collections.ConfigDict: +def model_config(config_diffs: Union[str, DictConfig, dict[str, dict[str, Any]]]) -> DictConfig: """Get the ConfigDict of a model.""" cfg = copy.deepcopy(CONFIG_ALLATOM) - if name in CONFIG_DIFFS: - cfg.update_from_flattened_dict(CONFIG_DIFFS[name]) + if config_diffs is None or config_diffs=='': + # early return if nothing is changed + return cfg - return cfg + if isinstance(config_diffs, DictConfig): + if 'preset' in config_diffs and (preset_name:=config_diffs['preset']) in CONFIG_DIFFS: + updated_config=CONFIG_DIFFS[preset_name] + cfg.merge_with_dotlist(updated_config) + print(f'Updated config from `CONFIG_DIFFS.{preset_name}`: {updated_config}') + + # update from detailed configuration + if any(root_kw in config_diffs for root_kw in CONFIG_ALLATOM): -CONFIG_DIFFS = { - 'allatom_demo': { - 'model.heads.confidence_head.weight': 0.01 - }, - 'allatom_subbatch_64_recycle_1': { - 'model.global_config.subbatch_size': 64, - 'model.num_recycle': 1, - }, + for root_kw in CONFIG_ALLATOM: + if root_kw not in config_diffs: + continue + cfg.merge_with(DictConfig({root_kw:config_diffs[root_kw]})) # merge to override + print(f'Updated config from `CONFIG_DIFFS`:{root_kw}: {config_diffs[root_kw]}') + + return cfg + + raise ValueError(f'Invalid config_diffs ({type(config_diffs)}): {config_diffs}') + + +# preset for runs +CONFIG_DIFFS: dict[str, list[str]] = { + 'allatom_demo': [ + 'model.heads.confidence_head.weight=0.01' + ], + 'allatom_subbatch_64_recycle_1': [ + 'model.global_config.subbatch_size=64', + 'model.num_recycle=1', + ] } -CONFIG_ALLATOM = ml_collections.ConfigDict({ +CONFIG_ALLATOM = DictConfig({ 'data': { 'num_blocks': 5, # for msa block deletion 'randomize_num_blocks': True, diff --git a/apps/protein_folding/helixfold3/requirements.txt b/apps/protein_folding/helixfold3/requirements.txt deleted file mode 
100644 index 660e43c1..00000000 --- a/apps/protein_folding/helixfold3/requirements.txt +++ /dev/null @@ -1,13 +0,0 @@ -absl-py==0.13.0 -biopython==1.79 -chex==0.0.7 -dm-haiku==0.0.4 -dm-tree==0.1.6 -docker==5.0.0 -immutabledict==2.0.0 -jax==0.2.14 -ml-collections==0.1.0 -pandas==1.3.4 -scipy==1.9.0 -rdkit-pypi==2022.9.5 -posebusters \ No newline at end of file diff --git a/apps/protein_folding/helixfold3/run_infer.sh b/apps/protein_folding/helixfold3/run_infer.sh deleted file mode 100644 index 5b0644e5..00000000 --- a/apps/protein_folding/helixfold3/run_infer.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -PYTHON_BIN="/usr/bin/python3" # changes to your python -ENV_BIN="/root/miniconda3/bin" # change to your env -MAXIT_SRC="PATH/TO/MAXIT/SRC" # changes to your MAXIT -export OBABEL_BIN="PATH/TO/OBABEL/BIN" # changes to your openbabel -DATA_DIR="./data" -export PATH="$MAXIT_SRC/bin:$PATH" - -CUDA_VISIBLE_DEVICES=0 "$PYTHON_BIN" inference.py \ - --maxit_binary "$MAXIT_SRC/bin/maxit" \ - --jackhmmer_binary_path "$ENV_BIN/jackhmmer" \ - --hhblits_binary_path "$ENV_BIN/hhblits" \ - --hhsearch_binary_path "$ENV_BIN/hhsearch" \ - --kalign_binary_path "$ENV_BIN/kalign" \ - --hmmsearch_binary_path "$ENV_BIN/hmmsearch" \ - --hmmbuild_binary_path "$ENV_BIN/hmmbuild" \ - --nhmmer_binary_path "$ENV_BIN/nhmmer" \ - --preset='reduced_dbs' \ - --bfd_database_path "$DATA_DIR/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt" \ - --small_bfd_database_path "$DATA_DIR/small_bfd/bfd-first_non_consensus_sequences.fasta" \ - --bfd_database_path "$DATA_DIR/small_bfd/bfd-first_non_consensus_sequences.fasta" \ - --uniclust30_database_path "$DATA_DIR/uniclust30/uniclust30_2018_08/uniclust30_2018_08" \ - --uniprot_database_path "$DATA_DIR/uniprot/uniprot.fasta" \ - --pdb_seqres_database_path "$DATA_DIR/pdb_seqres/pdb_seqres.txt" \ - --uniref90_database_path "$DATA_DIR/uniref90/uniref90.fasta" \ - --mgnify_database_path "$DATA_DIR/mgnify/mgy_clusters_2018_12.fa" \ - 
--template_mmcif_dir "$DATA_DIR/pdb_mmcif/mmcif_files" \ - --obsolete_pdbs_path "$DATA_DIR/pdb_mmcif/obsolete.dat" \ - --ccd_preprocessed_path "$DATA_DIR/ccd_preprocessed_etkdg.pkl.gz" \ - --rfam_database_path "$DATA_DIR/Rfam-14.9_rep_seq.fasta" \ - --max_template_date=2020-05-14 \ - --input_json data/demo_6zcy.json \ - --output_dir ./output \ - --model_name allatom_demo \ - --init_model init_models/HelixFold3-240814.pdparams \ - --infer_times 1 \ - --diff_batch_size 1 \ - --precision "fp32" \ No newline at end of file diff --git a/apps/protein_folding/helixfold3/setup_env.sh b/apps/protein_folding/helixfold3/setup_env.sh deleted file mode 100644 index 30f008d6..00000000 --- a/apps/protein_folding/helixfold3/setup_env.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -ENV_NAME='helixfold' -CUDA=12.0 - -# follow https://developer.nvidia.com/cuda-downloads to install cuda and cudatoolkit - -# Install py env -conda create -n ${ENV_NAME} -y -c conda-forge pip python=3.9; -source activate ${ENV_NAME} -conda install -y cudnn=8.4.1 cudatoolkit=11.7 nccl=2.14.3 -c conda-forge -c nvidia - -conda install -y -c bioconda hmmer==3.3.2 kalign2==2.04 hhsuite==3.3.0 -conda install -y -c conda-forge openbabel - -python -m pip install --upgrade 'pip<24';pip install . 
--no-cache-dir - -pip install https://paddle-wheel.bj.bcebos.com/2.5.1/linux/linux-gpu-cuda11.7-cudnn8.4.1-mkl-gcc8.2-avx/paddlepaddle_gpu-2.5.1.post117-cp39-cp39-linux_x86_64.whl From fbb2e05a7ed8590e793473bc6659eb7996ca774d Mon Sep 17 00:00:00 2001 From: YaoYinYing <33014714+YaoYinYing@users.noreply.github.com> Date: Mon, 26 Aug 2024 14:27:45 +0800 Subject: [PATCH 5/6] chore: cpu only for msa only --- .../helixfold3/helixfold/inference.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py index f67e18a5..d176d057 100644 --- a/apps/protein_folding/helixfold3/helixfold/inference.py +++ b/apps/protein_folding/helixfold3/helixfold/inference.py @@ -454,6 +454,12 @@ def split_prediction(pred, rank): def main(cfg: DictConfig): set_logging_level(cfg.logging_level) + if cfg.msa_only == True: + logging.warning(f'Model inference will be skipped because MSA-only mode is required.') + logging.warning(f'Use CPU only') + paddle.device.set_device("cpu") + + """main function""" new_einsum = os.getenv("FLAGS_new_einsum", True) print(f'>>> PaddlePaddle commit: {paddle.version.commit}') @@ -505,6 +511,8 @@ def main(cfg: DictConfig): model.helixfold.set_state_dict(pd_params['model']) else: model.helixfold.set_state_dict(pd_params) + + if cfg.precision == "bf16" and cfg.amp_level == "O2": raise NotImplementedError("bf16 O2 is not supported yet.") @@ -531,6 +539,10 @@ def main(cfg: DictConfig): with open(features_pkl, 'wb') as f: pickle.dump(feature_dict, f, protocol=4) + if cfg.msa_only == True: + logging.warning(f'Model inference is skipped because MSA-only mode is required.') + exit() + feature_dict['feat'] = batch_convert(feature_dict['feat'], add_batch=True) feature_dict['label'] = batch_convert(feature_dict['label'], add_batch=True) From 62dbaa4c74c0b7c6025f3fbed7e0fdf312a556cf Mon Sep 17 00:00:00 2001 From: YaoYinYing 
<33014714+YaoYinYing@users.noreply.github.com> Date: Sat, 17 Aug 2024 16:05:58 +0800 Subject: [PATCH 6/6] fix: maxit run with env fix: maxit path fix: maxit run with env --- .../helixfold/common/all_atom_pdb_save.py | 29 ++++++++++++++++--- .../helixfold/config/helixfold.yaml | 1 - .../helixfold3/helixfold/inference.py | 28 +++++++++--------- 3 files changed, 38 insertions(+), 20 deletions(-) diff --git a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py index 9c9f288e..92e7d225 100644 --- a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py +++ b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py @@ -21,6 +21,7 @@ import paddle import itertools import os +import subprocess FeatureDict = Mapping[str, np.ndarray] ModelOutput = Mapping[str, Any] # Is a nested dict. @@ -164,7 +165,7 @@ def prediction_to_mmcif(pred_atom_pos: Union[np.ndarray, paddle.Tensor], - maxit_binary: path to maxit_binary, use to convert pdb to cif - mmcif_path: path to save *.cif """ - if os.path.isfile(maxit_binary): + if not os.path.isfile(maxit_binary): raise FileNotFoundError( f'maxit_binary: {maxit_binary} not exists. 
' f'link: https://sw-tools.rcsb.org/apps/MAXIT/source.html') @@ -174,7 +175,27 @@ def prediction_to_mmcif(pred_atom_pos: Union[np.ndarray, paddle.Tensor], pdb_path = mmcif_path.replace('.cif', '.pdb') pdb_path = prediction_to_pdb(pred_atom_pos, FeatsDict, pdb_path) - msg = os.system(f'{maxit_binary} -i {pdb_path} -o 1 -output {mmcif_path}') - if msg != 0: - print(f'convert pdb to cif failed, error message: {msg}') + + cmd=[maxit_binary, + '-i', pdb_path, + '-o', '1', + '-output', mmcif_path, + ] + + print('Launching subprocess "%s"', ' '.join(cmd)) + + process = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=os.environ.copy()) + + + stdout, stderr = process.communicate() + retcode = process.wait() + + + if retcode: + # Logs have a 15k character limit, so log HHblits error line by line. + print('maxit failed. HHblits stderr begin:') + raise RuntimeError('HHblits failed\nstdout:\n%s\n\nstderr:\n%s\n' % ( + stdout.decode('utf-8'), stderr[:500_000].decode('utf-8'))) + return mmcif_path \ No newline at end of file diff --git a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml index 4900fdca..047ee386 100644 --- a/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml +++ b/apps/protein_folding/helixfold3/helixfold/config/helixfold.yaml @@ -6,7 +6,6 @@ defaults: bf16_infer: false # Corresponds to --bf16_infer seed: null # Corresponds to --seed logging_level: DEBUG # Corresponds to --logging_level -job_id: 'structure_prediction' # Corresponds to --model_name weight_path: /mnt/db/weights/helixfold/HelixFold3-params-240814/HelixFold3-240814.pdparams # Corresponds to --init_model precision: fp32 # Corresponds to --precision amp_level: O1 # Corresponds to --amp_level diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py index d176d057..0531ead7 100644 --- 
a/apps/protein_folding/helixfold3/helixfold/inference.py +++ b/apps/protein_folding/helixfold3/helixfold/inference.py @@ -194,29 +194,27 @@ def ranking_all_predictions(output_dirs): rank_id += 1 @paddle.no_grad() -def eval(cfg: DictConfig, model:RunModel, batch): - """Evaluate a given dataset""" +def eval(args, model, batch): + """evaluate a given dataset""" model.eval() - # Inference + # inference def _forward_with_precision(batch): - precision=cfg.precision - if precision not in ('bf16','fp32',): - raise ValueError("Please choose precision from bf16 and fp32!") - - if cfg.precision == "bf16" or cfg.bf16_infer: + if args.precision == "bf16" or args.bf16_infer: black_list, white_list = get_custom_amp_list() with paddle.amp.auto_cast(enable=True, - custom_white_list=white_list, - custom_black_list=black_list, - level=cfg.amp_level, - dtype='bfloat16'): + custom_white_list=white_list, + custom_black_list=black_list, + level=args.amp_level, + dtype='bfloat16'): return model(batch, compute_loss=False) - - return model(batch, compute_loss=False) + elif args.precision == "fp32": + return model(batch, compute_loss=False) + else: + raise ValueError("Please choose precision from bf16 and fp32! ") res = _forward_with_precision(batch) - logger.info("Inference Succeeds...\n") + logger.info(f"Inference Succeeds...\n") return res