Skip to content

Commit 9ed5d0c

Browse files
authored
Hf3 bugs fix and update readme (#307)
* rm dup READMD and fix typo * support rank infer and update readme * fix bugs * update requirements * fix typo * upload scripts * refine citation * fix typo
1 parent 8f9db31 commit 9ed5d0c

20 files changed

+487
-87
lines changed

apps/protein_folding/helixfold3/README.md

Lines changed: 113 additions & 71 deletions
Large diffs are not rendered by default.
Binary file not shown.

apps/protein_folding/helixfold3/helixfold/data/templates_quat_affine.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,7 @@ def invert_point(self, transformed_point, extra_dims=0):
303303
transformed_point[1] - translation[..., 1],
304304
transformed_point[2] - translation[..., 2]]
305305

306-
return apply_inverse_rot_to_vec(rotation, rot_point)O
306+
return apply_inverse_rot_to_vec(rotation, rot_point)
307307

308308

309309
######Paddle Implementation

apps/protein_folding/helixfold3/infer_scripts/preprocess.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252
3: 'Unknown error.'
5353
}
5454

55-
OBABEL_BIN = os.getenv('OBABEL_BIN', '/root/paddlejob/workspace/output/yangpan/program_files/anaconda3/bin/conda')
55+
# Path to the Open Babel executable, read from the OBABEL_BIN environment variable.
OBABEL_BIN = os.getenv('OBABEL_BIN')
# os.getenv returns None when the variable is unset, and os.path.exists(None)
# raises TypeError; handle that case first so callers get FileNotFoundError.
if not OBABEL_BIN:
    raise FileNotFoundError(
        'Cannot find obabel binary: the OBABEL_BIN environment variable is not set.')
if not os.path.exists(OBABEL_BIN):
    raise FileNotFoundError(f'Cannot find obabel binary at {OBABEL_BIN}.')
5858

apps/protein_folding/helixfold3/inference.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -517,12 +517,15 @@ def main(args):
517517
print(f"============ Start Inference ============")
518518

519519
infer_times = args.infer_times
520-
diff_batch_size = model_config.model.heads.diffusion_module.test_diff_batch_size
521-
logger.info(f'Inference {infer_times} Times...\n')
520+
if args.diff_batch_size > 0:
521+
model_config.model.heads.diffusion_module.test_diff_batch_size = args.diff_batch_size
522+
diff_batch_size = model_config.model.heads.diffusion_module.test_diff_batch_size
523+
logger.info(f'Inference {infer_times} Times...')
524+
logger.info(f" diffusion batch size {diff_batch_size}...\n")
522525
all_pred_path = []
523526
for infer_id in range(infer_times):
524527

525-
logger.info(f'Start {infer_id}-th inference, rank {diff_batch_size}...\n')
528+
logger.info(f'Start {infer_id}-th inference...\n')
526529
prediction = eval(args, model, feature_dict)
527530

528531
# save result
@@ -553,6 +556,7 @@ def main(args):
553556
parser.add_argument("--precision", type=str, choices=['fp32', 'bf16'], default='fp32')
554557
parser.add_argument("--amp_level", type=str, default='O1')
555558
parser.add_argument("--infer_times", type=int, default=1)
559+
parser.add_argument("--diff_batch_size", type=int, default=-1)
556560
parser.add_argument('--input_json', type=str,
557561
default=None, required=True,
558562
help='Paths to json file, each containing '

apps/protein_folding/helixfold3/requirements.txt

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,7 @@ docker==5.0.0
77
immutabledict==2.0.0
88
jax==0.2.14
99
ml-collections==0.1.0
10-
numpy==1.19.5
1110
pandas==1.3.4
12-
scipy==1.7.0
11+
scipy==1.9.0
1312
rdkit-pypi==2022.9.5
1413
posebusters

apps/protein_folding/helixfold3/run_infer.sh

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
#!/bin/bash
22

3-
PYTHON_BIN="/usr/bin/python3"
4-
ENV_BIN="/root/miniconda3/bin"
5-
# MAXIT_SRC="/root/paddlejob/workspace/output/yexianbin/dcu_command/maxit-v11.200-prod-src/"
6-
MAXIT_SRC="/home/rudder_paddle_home/custom_workspace/init_models/paddlehelix/paddlefold/maxit-v11.100-prod-src"
3+
PYTHON_BIN="/usr/bin/python3" # change to your Python interpreter
4+
ENV_BIN="/root/miniconda3/bin" # change to your env
5+
MAXIT_SRC="PATH/TO/MAXIT/SRC" # change to your MAXIT source directory
6+
export OBABEL_BIN="PATH/TO/OBABEL/BIN" # change to your Open Babel (obabel) binary
77
DATA_DIR="./data"
8-
export PATH="$MAXIT_SRC/hin:$PATH"
8+
export PATH="$MAXIT_SRC/bin:$PATH"
99

1010
CUDA_VISIBLE_DEVICES=0 "$PYTHON_BIN" inference.py \
1111
--maxit_binary "$MAXIT_SRC/bin/maxit" \
@@ -27,12 +27,13 @@ CUDA_VISIBLE_DEVICES=0 "$PYTHON_BIN" inference.py \
2727
--template_mmcif_dir "$DATA_DIR/pdb_mmcif/mmcif_files" \
2828
--obsolete_pdbs_path "$DATA_DIR/pdb_mmcif/obsolete.dat" \
2929
--ccd_preprocessed_path "$DATA_DIR/ccd_preprocessed_etkdg.pkl.gz" \
30-
--rfam_database_path "$DATA_DIR/RNA_MSA_databases/Rfam-14.9_rep_seq.fasta" \
30+
--rfam_database_path "$DATA_DIR/Rfam-14.9_rep_seq.fasta" \
3131
--max_template_date=2020-05-14 \
3232
--input_json data/demo_6zcy.json \
3333
--output_dir ./output \
3434
--model_name allatom_demo \
35-
--init_model init_models/helixfold_aa_init_model.pdparams \
36-
--infer_times 3 \
35+
--init_model init_models/HelixFold3-240814.pdparams \
36+
--infer_times 1 \
37+
--diff_batch_size 1 \
3738
--precision "bf16" \
3839
--no_msa_templ_feats # comment it to enable MSA searching
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
#!/bin/bash
2+
# Usage: bash download_all_data.sh /path/to/download/directory
3+
set -e
4+
5+
if [[ $# -eq 0 ]]; then
6+
echo "Error: download directory must be provided as an input argument."
7+
exit 1
8+
fi
9+
10+
if ! command -v aria2c &> /dev/null ; then
11+
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
12+
exit 1
13+
fi
14+
15+
DOWNLOAD_DIR="$1"
16+
DOWNLOAD_MODE="${2:-full_dbs}" # Default mode to full_dbs.
17+
if [[ "${DOWNLOAD_MODE}" != full_dbs && "${DOWNLOAD_MODE}" != reduced_dbs ]]
18+
then
19+
echo "DOWNLOAD_MODE ${DOWNLOAD_MODE} not recognized."
20+
exit 1
21+
fi
22+
23+
SCRIPT_DIR="$(dirname "$(realpath "$0")")"
24+
25+
echo "Downloading HelixFold3 checkpoints..."
26+
bash "${SCRIPT_DIR}/download_helixfold3_checkpoints.sh" "${DOWNLOAD_DIR}"
27+
28+
if [[ "${DOWNLOAD_MODE}" = reduced_dbs ]] ; then
29+
echo "Downloading Small BFD..."
30+
bash "${SCRIPT_DIR}/download_small_bfd.sh" "${DOWNLOAD_DIR}"
31+
else
32+
echo "Downloading BFD..."
33+
bash "${SCRIPT_DIR}/download_bfd.sh" "${DOWNLOAD_DIR}"
34+
fi
35+
36+
echo "Downloading MGnify..."
37+
bash "${SCRIPT_DIR}/download_mgnify.sh" "${DOWNLOAD_DIR}"
38+
39+
echo "Downloading PDB mmCIF files..."
40+
bash "${SCRIPT_DIR}/download_pdb_mmcif.sh" "${DOWNLOAD_DIR}"
41+
42+
echo "Downloading Uniclust30..."
43+
bash "${SCRIPT_DIR}/download_uniclust30.sh" "${DOWNLOAD_DIR}"
44+
45+
echo "Downloading Uniref90..."
46+
bash "${SCRIPT_DIR}/download_uniref90.sh" "${DOWNLOAD_DIR}"
47+
48+
echo "Downloading UniProt..."
49+
bash "${SCRIPT_DIR}/download_uniprot.sh" "${DOWNLOAD_DIR}"
50+
51+
echo "Downloading PDB SeqRes..."
52+
bash "${SCRIPT_DIR}/download_pdb_seqres.sh" "${DOWNLOAD_DIR}"
53+
54+
echo "Downloading RNA MSA..."
55+
bash "${SCRIPT_DIR}/download_rna.sh" "${DOWNLOAD_DIR}"
56+
57+
echo "Downloading CCD pickel..."
58+
bash "${SCRIPT_DIR}/download_ccd_pkl.sh" "${DOWNLOAD_DIR}"
59+
60+
echo "All data downloaded."
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#!/bin/bash
2+
# Usage: bash download_bfd.sh /path/to/download/directory
3+
set -e
4+
5+
if [[ $# -eq 0 ]]; then
6+
echo "Error: download directory must be provided as an input argument."
7+
exit 1
8+
fi
9+
10+
if ! command -v aria2c &> /dev/null ; then
11+
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
12+
exit 1
13+
fi
14+
15+
DOWNLOAD_DIR="$1"
16+
ROOT_DIR="${DOWNLOAD_DIR}/bfd"
17+
# Mirror of:
18+
# https://bfd.mmseqs.com/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz.
19+
SOURCE_URL="https://storage.googleapis.com/alphafold-databases/casp14_versions/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz"
20+
BASENAME=$(basename "${SOURCE_URL}")
21+
22+
mkdir --parents "${ROOT_DIR}"
23+
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
24+
tar --extract --verbose --file="${ROOT_DIR}/${BASENAME}" \
25+
--directory="${ROOT_DIR}"
26+
rm "${ROOT_DIR}/${BASENAME}"
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/bin/bash
2+
# Usage: bash download_ccd_pkl.sh /path/to/download/directory
3+
set -e
4+
5+
if [[ $# -eq 0 ]]; then
6+
echo "Error: download directory must be provided as an input argument."
7+
exit 1
8+
fi
9+
10+
if ! command -v aria2c &> /dev/null ; then
11+
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
12+
exit 1
13+
fi
14+
15+
DOWNLOAD_DIR="$1"
16+
ROOT_DIR="${DOWNLOAD_DIR}"
17+
SOURCE_URL="https://paddlehelix.bd.bcebos.com/HelixFold3/CCD/ccd_preprocessed_etkdg.pkl.gz"
18+
BASENAME=$(basename "${SOURCE_URL}")
19+
20+
mkdir --parents "${ROOT_DIR}"
21+
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#!/bin/bash
2+
# Usage: bash download_helixfold3_checkpoints.sh /path/to/download/directory
3+
set -e
4+
5+
if [[ $# -eq 0 ]]; then
6+
echo "Error: download directory must be provided as an input argument."
7+
exit 1
8+
fi
9+
10+
if ! command -v aria2c &> /dev/null ; then
11+
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
12+
exit 1
13+
fi
14+
15+
DOWNLOAD_DIR="$1"
16+
ROOT_DIR="${DOWNLOAD_DIR}/params"
17+
SOURCE_URL="https://paddlehelix.bd.bcebos.com/HelixFold3/params/HelixFold3-params-240814.zip"
18+
BASENAME=$(basename "${SOURCE_URL}")
19+
20+
mkdir --parents "${ROOT_DIR}"
21+
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
22+
# The checkpoint bundle is a .zip archive, which GNU tar cannot read; use unzip.
if ! command -v unzip &> /dev/null ; then
    echo "Error: unzip could not be found. Please install unzip (sudo apt install unzip)."
    exit 1
fi
# -o overwrites existing files without prompting so re-runs stay non-interactive.
unzip -o "${ROOT_DIR}/${BASENAME}" -d "${ROOT_DIR}"
24+
rm "${ROOT_DIR}/${BASENAME}"
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#!/bin/bash
2+
# Usage: bash download_mgnify.sh /path/to/download/directory
3+
set -e
4+
5+
if [[ $# -eq 0 ]]; then
6+
echo "Error: download directory must be provided as an input argument."
7+
exit 1
8+
fi
9+
10+
if ! command -v aria2c &> /dev/null ; then
11+
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
12+
exit 1
13+
fi
14+
15+
DOWNLOAD_DIR="$1"
16+
ROOT_DIR="${DOWNLOAD_DIR}/mgnify"
17+
# Mirror of:
18+
# ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/peptide_database/2018_12/mgy_clusters.fa.gz
19+
SOURCE_URL="https://storage.googleapis.com/alphafold-databases/casp14_versions/mgy_clusters_2018_12.fa.gz"
20+
BASENAME=$(basename "${SOURCE_URL}")
21+
22+
mkdir --parents "${ROOT_DIR}"
23+
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
24+
pushd "${ROOT_DIR}"
25+
gunzip "${ROOT_DIR}/${BASENAME}"
26+
popd
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
#!/bin/bash
2+
# Usage: bash download_pdb_mmcif.sh /path/to/download/directory
3+
set -e
4+
5+
if [[ $# -eq 0 ]]; then
6+
echo "Error: download directory must be provided as an input argument."
7+
exit 1
8+
fi
9+
10+
if ! command -v aria2c &> /dev/null ; then
11+
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
12+
exit 1
13+
fi
14+
15+
if ! command -v rsync &> /dev/null ; then
16+
echo "Error: rsync could not be found. Please install rsync."
17+
exit 1
18+
fi
19+
20+
DOWNLOAD_DIR="$1"
21+
ROOT_DIR="${DOWNLOAD_DIR}/pdb_mmcif"
22+
RAW_DIR="${ROOT_DIR}/raw"
23+
MMCIF_DIR="${ROOT_DIR}/mmcif_files"
24+
25+
echo "Running rsync to fetch all mmCIF files (note that the rsync progress estimate might be inaccurate)..."
26+
mkdir --parents "${RAW_DIR}"
27+
rsync --recursive --links --perms --times --compress --info=progress2 --delete --port=33444 \
28+
rsync.rcsb.org::ftp_data/structures/divided/mmCIF/ \
29+
"${RAW_DIR}"
30+
31+
echo "Unzipping all mmCIF files..."
32+
find "${RAW_DIR}/" -type f -iname "*.gz" -exec gunzip {} +
33+
34+
echo "Flattening all mmCIF files..."
35+
mkdir --parents "${MMCIF_DIR}"
36+
find "${RAW_DIR}" -type d -empty -delete # Delete empty directories.
37+
for subdir in "${RAW_DIR}"/*; do
38+
mv "${subdir}/"*.cif "${MMCIF_DIR}"
39+
done
40+
41+
# Delete empty download directory structure.
42+
find "${RAW_DIR}" -type d -empty -delete
43+
44+
aria2c "ftp://ftp.wwpdb.org/pub/pdb/data/status/obsolete.dat" --dir="${ROOT_DIR}"
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/bin/bash
2+
# Usage: bash download_pdb_seqres.sh /path/to/download/directory
3+
set -e
4+
5+
if [[ $# -eq 0 ]]; then
6+
echo "Error: download directory must be provided as an input argument."
7+
exit 1
8+
fi
9+
10+
if ! command -v aria2c &> /dev/null ; then
11+
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
12+
exit 1
13+
fi
14+
15+
DOWNLOAD_DIR="$1"
16+
ROOT_DIR="${DOWNLOAD_DIR}/pdb_seqres"
17+
SOURCE_URL="ftp://ftp.wwpdb.org/pub/pdb/derived_data/pdb_seqres.txt"
18+
BASENAME=$(basename "${SOURCE_URL}")
19+
20+
mkdir --parents "${ROOT_DIR}"
21+
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/bin/bash
2+
# Usage: bash download_rna.sh /path/to/download/directory
3+
set -e
4+
5+
if [[ $# -eq 0 ]]; then
6+
echo "Error: download directory must be provided as an input argument."
7+
exit 1
8+
fi
9+
10+
if ! command -v aria2c &> /dev/null ; then
11+
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
12+
exit 1
13+
fi
14+
15+
DOWNLOAD_DIR="$1"
16+
ROOT_DIR="${DOWNLOAD_DIR}/"
17+
SOURCE_URL="https://paddlehelix.bd.bcebos.com/HelixFold3/MSA/Rfam-14.9_rep_seq.fasta"
18+
BASENAME=$(basename "${SOURCE_URL}")
19+
20+
mkdir --parents "${ROOT_DIR}"
21+
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#!/bin/bash
2+
# Usage: bash download_small_bfd.sh /path/to/download/directory
3+
set -e
4+
5+
if [[ $# -eq 0 ]]; then
6+
echo "Error: download directory must be provided as an input argument."
7+
exit 1
8+
fi
9+
10+
if ! command -v aria2c &> /dev/null ; then
11+
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
12+
exit 1
13+
fi
14+
15+
DOWNLOAD_DIR="$1"
16+
ROOT_DIR="${DOWNLOAD_DIR}/small_bfd"
17+
SOURCE_URL="https://storage.googleapis.com/alphafold-databases/reduced_dbs/bfd-first_non_consensus_sequences.fasta.gz"
18+
BASENAME=$(basename "${SOURCE_URL}")
19+
20+
mkdir --parents "${ROOT_DIR}"
21+
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
22+
pushd "${ROOT_DIR}"
23+
gunzip "${ROOT_DIR}/${BASENAME}"
24+
popd

0 commit comments

Comments
 (0)