Skip to content

Hf3 bugs fix and update readme #307

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
184 changes: 113 additions & 71 deletions apps/protein_folding/helixfold3/README.md

Large diffs are not rendered by default.

Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ def invert_point(self, transformed_point, extra_dims=0):
transformed_point[1] - translation[..., 1],
transformed_point[2] - translation[..., 2]]

return apply_inverse_rot_to_vec(rotation, rot_point)O
return apply_inverse_rot_to_vec(rotation, rot_point)


######Paddle Implementation
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
3: 'Unknown error.'
}

OBABEL_BIN = os.getenv('OBABEL_BIN', '/root/paddlejob/workspace/output/yangpan/program_files/anaconda3/bin/conda')
# Location of the Open Babel executable, read from the OBABEL_BIN environment
# variable at import time.
OBABEL_BIN = os.getenv('OBABEL_BIN')
# os.getenv returns None when the variable is unset, and os.path.exists(None)
# raises TypeError; check for the missing value first so the caller always
# gets the intended FileNotFoundError.
if not OBABEL_BIN:
    raise FileNotFoundError(
        'Cannot find obabel binary: environment variable OBABEL_BIN is not set.')
if not os.path.exists(OBABEL_BIN):
    raise FileNotFoundError(f'Cannot find obabel binary at {OBABEL_BIN}.')

Expand Down
10 changes: 7 additions & 3 deletions apps/protein_folding/helixfold3/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,12 +517,15 @@ def main(args):
print(f"============ Start Inference ============")

infer_times = args.infer_times
diff_batch_size = model_config.model.heads.diffusion_module.test_diff_batch_size
logger.info(f'Inference {infer_times} Times...\n')
if args.diff_batch_size > 0:
model_config.model.heads.diffusion_module.test_diff_batch_size = args.diff_batch_size
diff_batch_size = model_config.model.heads.diffusion_module.test_diff_batch_size
logger.info(f'Inference {infer_times} Times...')
logger.info(f" diffusion batch size {diff_batch_size}...\n")
all_pred_path = []
for infer_id in range(infer_times):

logger.info(f'Start {infer_id}-th inference, rank {diff_batch_size}...\n')
logger.info(f'Start {infer_id}-th inference...\n')
prediction = eval(args, model, feature_dict)

# save result
Expand Down Expand Up @@ -553,6 +556,7 @@ def main(args):
parser.add_argument("--precision", type=str, choices=['fp32', 'bf16'], default='fp32')
parser.add_argument("--amp_level", type=str, default='O1')
parser.add_argument("--infer_times", type=int, default=1)
parser.add_argument("--diff_batch_size", type=int, default=-1)
parser.add_argument('--input_json', type=str,
default=None, required=True,
help='Paths to json file, each containing '
Expand Down
3 changes: 1 addition & 2 deletions apps/protein_folding/helixfold3/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@ docker==5.0.0
immutabledict==2.0.0
jax==0.2.14
ml-collections==0.1.0
numpy==1.19.5
pandas==1.3.4
scipy==1.7.0
scipy==1.9.0
rdkit-pypi==2022.9.5
posebusters
17 changes: 9 additions & 8 deletions apps/protein_folding/helixfold3/run_infer.sh
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
#!/bin/bash

PYTHON_BIN="/usr/bin/python3"
ENV_BIN="/root/miniconda3/bin"
# MAXIT_SRC="/root/paddlejob/workspace/output/yexianbin/dcu_command/maxit-v11.200-prod-src/"
MAXIT_SRC="/home/rudder_paddle_home/custom_workspace/init_models/paddlehelix/paddlefold/maxit-v11.100-prod-src"
PYTHON_BIN="/usr/bin/python3" # changes to your python
ENV_BIN="/root/miniconda3/bin" # change to your env
MAXIT_SRC="PATH/TO/MAXIT/SRC" # changes to your MAXIT
export OBABEL_BIN="PATH/TO/OBABEL/BIN" # changes to your openbabel
DATA_DIR="./data"
export PATH="$MAXIT_SRC/hin:$PATH"
export PATH="$MAXIT_SRC/bin:$PATH"

CUDA_VISIBLE_DEVICES=0 "$PYTHON_BIN" inference.py \
--maxit_binary "$MAXIT_SRC/bin/maxit" \
Expand All @@ -27,12 +27,13 @@ CUDA_VISIBLE_DEVICES=0 "$PYTHON_BIN" inference.py \
--template_mmcif_dir "$DATA_DIR/pdb_mmcif/mmcif_files" \
--obsolete_pdbs_path "$DATA_DIR/pdb_mmcif/obsolete.dat" \
--ccd_preprocessed_path "$DATA_DIR/ccd_preprocessed_etkdg.pkl.gz" \
--rfam_database_path "$DATA_DIR/RNA_MSA_databases/Rfam-14.9_rep_seq.fasta" \
--rfam_database_path "$DATA_DIR/Rfam-14.9_rep_seq.fasta" \
--max_template_date=2020-05-14 \
--input_json data/demo_6zcy.json \
--output_dir ./output \
--model_name allatom_demo \
--init_model init_models/helixfold_aa_init_model.pdparams \
--infer_times 3 \
--init_model init_models/HelixFold3-240814.pdparams \
--infer_times 1 \
--diff_batch_size 1 \
--precision "bf16" \
--no_msa_templ_feats # comment it to enable MSA searching
60 changes: 60 additions & 0 deletions apps/protein_folding/helixfold3/scripts/download_all_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/bin/bash
# Download all HelixFold3 data by running the per-dataset download scripts
# located in the same directory as this file, one after another.
#
# Usage: bash download_all_data.sh /path/to/download/directory [full_dbs|reduced_dbs]
#   $1  download directory, forwarded to every per-dataset script (required)
#   $2  download mode (optional, default "full_dbs"):
#       "full_dbs" runs download_bfd.sh, "reduced_dbs" runs download_small_bfd.sh
set -e

if [[ $# -eq 0 ]]; then
    echo "Error: download directory must be provided as an input argument."
    exit 1
fi

# Checked here so a missing downloader is reported before any sub-script runs.
if ! command -v aria2c &> /dev/null ; then
    echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
    exit 1
fi

DOWNLOAD_DIR="$1"
DOWNLOAD_MODE="${2:-full_dbs}"  # Default mode to full_dbs.
if [[ "${DOWNLOAD_MODE}" != full_dbs && "${DOWNLOAD_MODE}" != reduced_dbs ]]
then
    echo "DOWNLOAD_MODE ${DOWNLOAD_MODE} not recognized."
    exit 1
fi

# Resolve the directory of this script so the sibling scripts are found no
# matter which working directory the caller uses.
SCRIPT_DIR="$(dirname "$(realpath "$0")")"

echo "Downloading HelixFold3 checkpoints..."
bash "${SCRIPT_DIR}/download_helixfold3_checkpoints.sh" "${DOWNLOAD_DIR}"

# The download mode only selects which BFD variant is fetched.
if [[ "${DOWNLOAD_MODE}" = reduced_dbs ]] ; then
    echo "Downloading Small BFD..."
    bash "${SCRIPT_DIR}/download_small_bfd.sh" "${DOWNLOAD_DIR}"
else
    echo "Downloading BFD..."
    bash "${SCRIPT_DIR}/download_bfd.sh" "${DOWNLOAD_DIR}"
fi

echo "Downloading MGnify..."
bash "${SCRIPT_DIR}/download_mgnify.sh" "${DOWNLOAD_DIR}"

echo "Downloading PDB mmCIF files..."
bash "${SCRIPT_DIR}/download_pdb_mmcif.sh" "${DOWNLOAD_DIR}"

echo "Downloading Uniclust30..."
bash "${SCRIPT_DIR}/download_uniclust30.sh" "${DOWNLOAD_DIR}"

echo "Downloading Uniref90..."
bash "${SCRIPT_DIR}/download_uniref90.sh" "${DOWNLOAD_DIR}"

echo "Downloading UniProt..."
bash "${SCRIPT_DIR}/download_uniprot.sh" "${DOWNLOAD_DIR}"

echo "Downloading PDB SeqRes..."
bash "${SCRIPT_DIR}/download_pdb_seqres.sh" "${DOWNLOAD_DIR}"

echo "Downloading RNA MSA..."
bash "${SCRIPT_DIR}/download_rna.sh" "${DOWNLOAD_DIR}"

echo "Downloading CCD pickle..."
bash "${SCRIPT_DIR}/download_ccd_pkl.sh" "${DOWNLOAD_DIR}"

echo "All data downloaded."
26 changes: 26 additions & 0 deletions apps/protein_folding/helixfold3/scripts/download_bfd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash
# Fetch the BFD archive with aria2c, unpack it into <download directory>/bfd,
# and delete the archive afterwards.
# Usage: bash download_bfd.sh /path/to/download/directory
set -e

# The destination directory is a mandatory first argument.
if (( $# == 0 )); then
    echo "Error: download directory must be provided as an input argument."
    exit 1
fi

# aria2c performs the download, so stop early when it is not installed.
command -v aria2c > /dev/null 2>&1 || {
    echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
    exit 1
}

# Mirror of:
# https://bfd.mmseqs.com/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz.
archive_url="https://storage.googleapis.com/alphafold-databases/casp14_versions/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz"
target_dir="$1/bfd"
# The archive path is derived from the last component of the URL.
archive_path="${target_dir}/$(basename "${archive_url}")"

mkdir -p "${target_dir}"
aria2c --dir="${target_dir}" "${archive_url}"
tar -x -v -f "${archive_path}" -C "${target_dir}"
rm "${archive_path}"
21 changes: 21 additions & 0 deletions apps/protein_folding/helixfold3/scripts/download_ccd_pkl.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash
# Download the preprocessed CCD pickle (ccd_preprocessed_etkdg.pkl.gz) into the
# download directory. The file is kept compressed; nothing is extracted.
# Usage: bash download_ccd_pkl.sh /path/to/download/directory
set -e

if [[ $# -eq 0 ]]; then
    echo "Error: download directory must be provided as an input argument."
    exit 1
fi

if ! command -v aria2c &> /dev/null ; then
    echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
    exit 1
fi

DOWNLOAD_DIR="$1"
# The file is stored at the top level of the download directory (no sub-folder).
ROOT_DIR="${DOWNLOAD_DIR}"
SOURCE_URL="https://paddlehelix.bd.bcebos.com/HelixFold3/CCD/ccd_preprocessed_etkdg.pkl.gz"

mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash
# Download the HelixFold3 parameter archive, unpack it into
# <download directory>/params, and delete the archive afterwards.
# Usage: bash download_helixfold3_checkpoints.sh /path/to/download/directory
set -e

if [[ $# -eq 0 ]]; then
    echo "Error: download directory must be provided as an input argument."
    exit 1
fi

if ! command -v aria2c &> /dev/null ; then
    echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
    exit 1
fi

# The checkpoint is distributed as a .zip file, which tar cannot unpack, so
# unzip is required as well.
if ! command -v unzip &> /dev/null ; then
    echo "Error: unzip could not be found. Please install unzip (sudo apt install unzip)."
    exit 1
fi

DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/params"
SOURCE_URL="https://paddlehelix.bd.bcebos.com/HelixFold3/params/HelixFold3-params-240814.zip"
BASENAME=$(basename "${SOURCE_URL}")

mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
# -o overwrites existing files without prompting so the script stays
# non-interactive when it is re-run.
unzip -o "${ROOT_DIR}/${BASENAME}" -d "${ROOT_DIR}"
rm "${ROOT_DIR}/${BASENAME}"
26 changes: 26 additions & 0 deletions apps/protein_folding/helixfold3/scripts/download_mgnify.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash
# Download the MGnify cluster database into <download directory>/mgnify and
# decompress it in place.
# Usage: bash download_mgnify.sh /path/to/download/directory
set -e

if [[ $# -eq 0 ]]; then
    echo "Error: download directory must be provided as an input argument."
    exit 1
fi

if ! command -v aria2c &> /dev/null ; then
    echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
    exit 1
fi

DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/mgnify"
# Mirror of:
# ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/peptide_database/2018_12/mgy_clusters.fa.gz
SOURCE_URL="https://storage.googleapis.com/alphafold-databases/casp14_versions/mgy_clusters_2018_12.fa.gz"
BASENAME=$(basename "${SOURCE_URL}")

mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
# gunzip writes the decompressed file next to the archive, so no directory
# change is needed. Changing into ROOT_DIR first would make the
# "${ROOT_DIR}/..." path unresolvable when the download directory is relative.
gunzip "${ROOT_DIR}/${BASENAME}"
44 changes: 44 additions & 0 deletions apps/protein_folding/helixfold3/scripts/download_pdb_mmcif.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/bin/bash
# Mirror the PDB mmCIF files with rsync, decompress them, move every .cif file
# into <download directory>/pdb_mmcif/mmcif_files, and finally download
# obsolete.dat into <download directory>/pdb_mmcif.
# Usage: bash download_pdb_mmcif.sh /path/to/download/directory
set -e

# The destination directory is a mandatory first argument.
if (( $# == 0 )); then
    echo "Error: download directory must be provided as an input argument."
    exit 1
fi

# aria2c is used for obsolete.dat, rsync for the mmCIF mirror.
command -v aria2c > /dev/null 2>&1 || {
    echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
    exit 1
}

command -v rsync > /dev/null 2>&1 || {
    echo "Error: rsync could not be found. Please install rsync."
    exit 1
}

pdb_dir="$1/pdb_mmcif"
raw_dir="${pdb_dir}/raw"
flat_dir="${pdb_dir}/mmcif_files"

echo "Running rsync to fetch all mmCIF files (note that the rsync progress estimate might be inaccurate)..."
mkdir -p "${raw_dir}"
rsync -r -l -p -t -z --info=progress2 --delete --port=33444 \
    rsync.rcsb.org::ftp_data/structures/divided/mmCIF/ \
    "${raw_dir}"

echo "Unzipping all mmCIF files..."
find "${raw_dir}/" -type f -iname "*.gz" -exec gunzip {} +

echo "Flattening all mmCIF files..."
mkdir -p "${flat_dir}"
# Remove empty directories first so the glob below only visits populated ones.
find "${raw_dir}" -type d -empty -delete
for shard in "${raw_dir}"/*; do
    mv "${shard}/"*.cif "${flat_dir}"
done

# The per-shard directories are empty after the move; clean them up.
find "${raw_dir}" -type d -empty -delete

aria2c --dir="${pdb_dir}" "ftp://ftp.wwpdb.org/pub/pdb/data/status/obsolete.dat"
21 changes: 21 additions & 0 deletions apps/protein_folding/helixfold3/scripts/download_pdb_seqres.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash
# Download pdb_seqres.txt into <download directory>/pdb_seqres.
# Usage: bash download_pdb_seqres.sh /path/to/download/directory
set -e

if [[ $# -eq 0 ]]; then
    echo "Error: download directory must be provided as an input argument."
    exit 1
fi

if ! command -v aria2c &> /dev/null ; then
    echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
    exit 1
fi

DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/pdb_seqres"
SOURCE_URL="ftp://ftp.wwpdb.org/pub/pdb/derived_data/pdb_seqres.txt"

# The file is plain text, so it is only downloaded; there is nothing to unpack.
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
21 changes: 21 additions & 0 deletions apps/protein_folding/helixfold3/scripts/download_rna.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash
# Download the Rfam FASTA file (Rfam-14.9_rep_seq.fasta) into the download
# directory. The file is downloaded as-is; nothing is extracted.
# Usage: bash download_rna.sh /path/to/download/directory
set -e

if [[ $# -eq 0 ]]; then
    echo "Error: download directory must be provided as an input argument."
    exit 1
fi

if ! command -v aria2c &> /dev/null ; then
    echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
    exit 1
fi

DOWNLOAD_DIR="$1"
# The file is stored at the top level of the download directory (no sub-folder).
ROOT_DIR="${DOWNLOAD_DIR}/"
SOURCE_URL="https://paddlehelix.bd.bcebos.com/HelixFold3/MSA/Rfam-14.9_rep_seq.fasta"

mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
24 changes: 24 additions & 0 deletions apps/protein_folding/helixfold3/scripts/download_small_bfd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash
# Download the reduced ("small") BFD FASTA file into
# <download directory>/small_bfd and decompress it in place.
# Usage: bash download_small_bfd.sh /path/to/download/directory
set -e

if [[ $# -eq 0 ]]; then
    echo "Error: download directory must be provided as an input argument."
    exit 1
fi

if ! command -v aria2c &> /dev/null ; then
    echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
    exit 1
fi

DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/small_bfd"
SOURCE_URL="https://storage.googleapis.com/alphafold-databases/reduced_dbs/bfd-first_non_consensus_sequences.fasta.gz"
BASENAME=$(basename "${SOURCE_URL}")

mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
# gunzip writes the decompressed file next to the archive, so no directory
# change is needed. Changing into ROOT_DIR first would make the
# "${ROOT_DIR}/..." path unresolvable when the download directory is relative.
gunzip "${ROOT_DIR}/${BASENAME}"
Loading