Commit 64af89f

hedonglong authored and committed

fix zero-tensor issue

2 parents f74a1e0 + 1bb0387 · commit 64af89f

60 files changed: +6,028 −1,303 lines
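The change repeated across most of these files replaces `tensor.numpy()[0]` with `float(tensor)` or `int(tensor)` when reading a scalar out of a Paddle tensor. A minimal sketch of the failure mode presumably being fixed, assuming Paddle's newer zero-dimensional (0-D) tensor semantics:

```python
import paddle

# Older Paddle returned scalars (e.g. a reduced loss) as shape-[1] tensors,
# so `loss.numpy()[0]` extracted the value. With 0-D tensors, `loss.numpy()`
# is a 0-D ndarray, and indexing it with [0] raises an IndexError.
loss = paddle.mean(paddle.to_tensor([0.25, 0.75]))  # scalar (0-D) tensor

# value = loss.numpy()[0]  # breaks once the tensor is 0-D

# float()/int() work for both shape-[1] and 0-D tensors, which is the form
# this commit converts every such read to.
value = float(loss)
print(value)  # 0.5
```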

.github/LIT-PCBA_result.png (316 KB)

.github/optimus_framework3.png (722 KB)

.github/pcqm4mv2_result.png (251 KB)

README.md (+2)

@@ -12,6 +12,8 @@ English | [简体中文](README_cn.md)
 
 
 ## Latest News
+`2022.08.11` PaddleHelix released the code of HelixGEM-2, a novel Molecular Property Prediction Network that models full-range many-body interactions, which ranked 1st on the OGB [PCQM4Mv2](https://ogb.stanford.edu/docs/lsc/leaderboards/) leaderboard. Please refer to the [paper](https://arxiv.org/abs/2208.05863) and [code](./apps/pretrained_compound/ChemRL/GEM-2) for more details.
+
 `2022.07.29` PaddleHelix released the code of HelixFold-Single, an **MSA-free** protein structure prediction pipeline relying on only the primary sequences, which can **predict the protein structures within seconds**. Please refer to the [paper](https://arxiv.org/abs/2207.13921) and [code](./apps/protein_folding/helixfold-single) for more details. Welcome to the [PaddleHelix website](https://paddlehelix.baidu.com/app/drug/protein-single/forecast
 ) to try out the structure prediction online service.

README_cn.md (+2)

@@ -10,6 +10,8 @@
 ![support os](https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-yellow.svg)
 
 ## Latest News
+`2022.08.11` The PaddleHelix team open-sourced the code of HelixGEM-2, a new small-molecule property prediction framework based on long-range many-body modeling, which took 1st place on the OGB [PCQM4Mv2](https://ogb.stanford.edu/docs/lsc/leaderboards/) leaderboard. See the [paper](https://arxiv.org/abs/2208.05863) and [code](./apps/pretrained_compound/ChemRL/GEM-2) for details.
+
 `2022.07.29` The PaddleHelix team open-sourced the code of HelixFold-Single, an **MSA-free** protein structure prediction pipeline that needs only the primary sequence as input and delivers **protein structure prediction within seconds**. See the [paper](https://arxiv.org/abs/2207.13921) and [code](./apps/protein_folding/helixfold-single). Welcome to the [PaddleHelix website](https://paddlehelix.baidu.com/app/drug/protein-single/forecast
 ) to try the online structure prediction service.

apps/drug_drug_synergy/RGCN/train.py (+1 −1)

@@ -87,7 +87,7 @@ def train(num_subgraph, graph, label_idx, epochs, sub_neighbours=[10, 10], init=
         fpr, tpr, _ = roc_curve(y_true=ground_truth, y_score=pred_prob)
         auc_v = auc(fpr, tpr)
         print("sub_graph index : {} | epoch: {} | training loss: {:.4f} | AUC: {:.3f}".format(
-            sub_g, epoch, train_loss.numpy()[0], auc_v))
+            sub_g, epoch, float(train_loss), auc_v))
 
     return model

apps/drug_target_interaction/batchdta/pairwise/DeepDTA/utils.py (+1 −1)

@@ -312,7 +312,7 @@ def model_eval(model,val_dataloader):
 
     for i_target_score in range(batch_smiles.shape[0]):
 
-        i_target_len = int(batch_len[i_target_score].numpy()[0])
+        i_target_len = int(batch_len[i_target_score])
         smiles = batch_smiles[i_target_score][0:i_target_len]
         target = batch_protein[i_target_score][0:i_target_len]
         y_label = batch_y[i_target_score][0:i_target_len].numpy()

apps/drug_target_interaction/batchdta/pairwise/GraphDTA/run_pairwise_GraphDTA_CV.py (+2 −2)

@@ -195,9 +195,9 @@ def model_eval(model,val_dataloader,device):
         i_data = i_data.to(device)
         pred_scores = model.forward_single(i_data)
         # get the predicted labels
-        i_target_pred_scores.append(pred_scores.cpu().numpy()[0])
+        i_target_pred_scores.append(float(pred_scores))
         # get the true labels
-        i_target_y_label.append(i_data.y.cpu().numpy()[0])
+        i_target_y_label.append(float(i_data.y.cpu()))
 
     i_target_pred_scores = np.array(i_target_pred_scores)
     i_target_y_label = np.array(i_target_y_label)

apps/drug_target_interaction/batchdta/pairwise/Moltrans/helper/utils/paddle_tensor.py (+1 −1)

@@ -32,7 +32,7 @@ def item(self):
     """
     Item function
     """
-    return self.numpy()[0]
+    return float(self)
 
 
 @add_tensor_function
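`add_tensor_function` itself is not shown in this hunk; judging from its name and how `item` is defined here, it presumably registers the decorated function as a method on `paddle.Tensor`. A hedged sketch of that monkey-patching pattern (not the file's actual helper, which may differ):

```python
import paddle

def add_tensor_function(func):
    """Assumed behavior: attach `func` to paddle.Tensor under the same name."""
    setattr(paddle.Tensor, func.__name__, func)
    return func

@add_tensor_function
def item(self):
    """Item function, robust to both shape-[1] and 0-D tensors."""
    return float(self)

x = paddle.to_tensor(3.0)
print(x.item())  # 3.0
```

The same one-line patch recurs below in the pointwise Moltrans and moltrans_dti copies of this file.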

apps/drug_target_interaction/batchdta/pairwise/Moltrans/run_pairwise_Moltrans_CV.py (+1 −1)

@@ -297,7 +297,7 @@ def model_eval(model,val_dataloader,len_SMILES,len_target):
 
     for i_target_score in range(batch_x.shape[0]):
 
-        i_target_len = int(batch_len[i_target_score].numpy()[0])
+        i_target_len = int(batch_len[i_target_score])
         smiles = batch_x_smiles[i_target_score][0:i_target_len]
         target = batch_x_protein[i_target_score][0:i_target_len]
         smiles_mask = batch_x_smiles_mask[i_target_score][0:i_target_len]

apps/drug_target_interaction/batchdta/pairwise/Moltrans/run_pairwise_Moltrans_bindingDB.py (+1 −1)

@@ -282,7 +282,7 @@ def model_eval(model,val_dataloader,len_SMILES,len_target):
 
     for i_target_score in range(batch_x.shape[0]):
 
-        i_target_len = int(batch_len[i_target_score].numpy()[0])
+        i_target_len = int(batch_len[i_target_score])
         smiles = batch_x_smiles[i_target_score][0:i_target_len]
         target = batch_x_protein[i_target_score][0:i_target_len]
         smiles_mask = batch_x_smiles_mask[i_target_score][0:i_target_len]

apps/drug_target_interaction/batchdta/pointwise/DeepDTA/train_bindingdb.py (+1 −1)

@@ -60,7 +60,7 @@ def training(model, training_loader, optim):
     optim.clear_grad()
     loss.backward()
     optim.step()
-    res_loss = loss.numpy()[0]
+    res_loss = float(loss)
     return res_loss

apps/drug_target_interaction/batchdta/pointwise/DeepDTA/train_davis.py (+1 −1)

@@ -60,7 +60,7 @@ def training(model, training_loader, optim):
     optim.clear_grad()
     loss.backward()
     optim.step()
-    res_loss = loss.numpy()[0]
+    res_loss = float(loss)
     return res_loss

apps/drug_target_interaction/batchdta/pointwise/DeepDTA/train_kiba.py (+1 −1)

@@ -63,7 +63,7 @@ def training(model, training_loader, optim):
     optim.clear_grad()
     loss.backward()
     optim.step()
-    res_loss = loss.numpy()[0]
+    res_loss = float(loss.numpy())
     return res_loss

apps/drug_target_interaction/batchdta/pointwise/Moltrans/helper/utils/paddle_tensor.py (+1 −1)

@@ -32,7 +32,7 @@ def item(self):
     """
     Item function
     """
-    return self.numpy()[0]
+    return float(self)
 
 
 @add_tensor_function

apps/drug_target_interaction/moltrans_dti/helper/utils/paddle_tensor.py (+1 −1)

@@ -32,7 +32,7 @@ def item(self):
     """
     Item function
     """
-    return self.numpy()[0]
+    return float(self.numpy())
 
 
 @add_tensor_function

apps/fewshot_molecular_property/chem_lib/models/trainer.py (+1 −1)

@@ -294,7 +294,7 @@ def train_step(self):
         losses_eval.backward()
         self.optimizer.step()
 
-        print('Train Epoch:', self.train_epoch, ', train update step:', k, ', loss_eval:', losses_eval.numpy()[0])
+        print('Train Epoch:', self.train_epoch, ', train update step:', k, ', loss_eval:', float(losses_eval))
 
     return self.model.layers

apps/molecular_generation/SD_VAE/train_zinc.py (+3 −3)

@@ -122,9 +122,9 @@ def _train_epoch(model, data_loader, epoch, kl_weight, optimizer=None):
         optimizer.clear_grad()
 
     # Log
-    kl_loss_values.append(kl_loss.numpy()[0])
-    perplexity_loss_values.append(perplexity.numpy()[0])
-    loss_values.append(loss.numpy()[0])
+    kl_loss_values.append(float(kl_loss))
+    perplexity_loss_values.append(float(perplexity))
+    loss_values.append(float(loss))
     lr = (optimizer.get_lr()
           if optimizer is not None
           else 0)
apps/pretrained_compound/ChemRL/GEM-2/README.md

@@ -1,8 +1,27 @@
-# GEM-2: Next Generation Molecular Property Prediction Network with Many-body and Full-range Interaction Modeling
-Molecular property prediction is a fundamental task in the drug and material industries. Physically, the properties of a molecule are determined by its own electronic structure, which can be exactly described by the Schrödinger equation. However, solving the Schrödinger equation for most molecules is extremely challenging due to long-range interactions in the behavior of a quantum many-body system. While deep learning methods have proven to be effective in molecular property prediction, we design a novel method, namely GEM-2, which comprehensively considers both the long-range and many-body interactions in molecules. GEM-2 consists of two interacted tracks: an atom-level track modeling both the local and global correlation between any two atoms, and a pair-level track modeling the correlation between all atom pairs, which embed information between any 3 or 4 atoms. Extensive experiments demonstrated the superiority of GEM-2 over multiple baseline methods in quantum chemistry and drug discovery tasks.
-
+# GEM-2: Next Generation Molecular Property Prediction Network by Modeling Full-range Many-body Interactions
+GEM-2 is a molecular modeling framework that comprehensively considers full-range many-body interactions in molecules. Multiple tracks are used to model the full-range interactions between the many-bodies of different orders, and a novel axial attention mechanism is designed to approximate full-range interaction modeling at much lower computational cost.
 A preprint version of our work can be found [here](https://arxiv.org/abs/2208.05863).
 
+## Framework
+<p align="center">
+<img src="../../../../.github/optimus_framework3.png" align="middle" height="70%" width="70%" />
+</p>
+
+The overall framework of GEM-2. First, a molecule is described by the representations of many-bodies of multiple orders. Then, Optimus blocks are designed to update the representations. Each Optimus block contains $M$ tracks, and the $m$-th track contains a stack of many-body axial attentions to model the full-range interactions between the $m$-bodies. The many-body axial attentions and the Low2High module also play the role of exchanging messages across the tracks. Finally, the molecular property prediction is made by pooling over the $1$-body representations.
+## Results
+### PCQM4Mv2
+PCQM4Mv2 is a large-scale quantum chemistry dataset containing DFT-calculated HOMO-LUMO
+energy gaps. The OGB leaderboard for PCQM4Mv2 can be found [here](https://ogb.stanford.edu/docs/lsc/leaderboards/#pcqm4mv2).
+<p align="center">
+<img src="../../../../.github/pcqm4mv2_result.png" align="middle" height="70%" width="70%" />
+</p>
+
+### LIT-PCBA
+LIT-PCBA is a virtual screening dataset containing protein targets with their corresponding active and inactive compounds selected from high-confidence PubChem BioAssay data.
+<p align="center">
+<img src="../../../../.github/LIT-PCBA_result.png" align="middle" height="70%" width="70%" />
+</p>
+
 # Installation guide
 ## Prerequisites
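The framework text added above leans on "many-body axial attention". As a rough illustration only (the real implementation lives in `src/optimus.py` below), axial attention over a 2-body (pair) representation attends along one axis at a time, which is what keeps full-range modeling affordable:

```python
import paddle
import paddle.nn as nn

class PairAxialAttention(nn.Layer):
    """Schematic sketch, not GEM-2 code: axial attention on a pair tensor
    x of shape (B, N, N, C). Full attention over all N*N pairs would cost
    O(N^4); attending along rows, then along columns, costs O(N^3)."""

    def __init__(self, channels, num_heads=4):
        super().__init__()
        self.row_attn = nn.MultiHeadAttention(channels, num_heads)
        self.col_attn = nn.MultiHeadAttention(channels, num_heads)

    def forward(self, x):                        # x: (B, N, N, C)
        b, n, _, c = x.shape
        rows = x.reshape([b * n, n, c])          # each row is one sequence
        x = self.row_attn(rows, rows, rows).reshape([b, n, n, c])
        cols = x.transpose([0, 2, 1, 3]).reshape([b * n, n, c])
        x = self.col_attn(cols, cols, cols).reshape([b, n, n, c])
        return x.transpose([0, 2, 1, 3])

x = paddle.randn([2, 16, 16, 32])
print(PairAxialAttention(32)(x).shape)           # [2, 16, 16, 32]
```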

@@ -27,50 +46,53 @@ Firstly, download or clone the latest github repository:
     git checkout dev
     cd apps/pretrained_compound/ChemRL/GEM-2
 
-# Data
+## Data
 You can download the PCQM4Mv2 dataset from the ogb website:
 
-    https://dgl-data.s3-accelerate.amazonaws.com/dataset/OGB-LSC/pcqm4m-v2.zip
+    wget https://dgl-data.s3-accelerate.amazonaws.com/dataset/OGB-LSC/pcqm4m-v2.zip
 
-# Processed Data
-You can download the processed PCQM4Mv2 dataset with rdkit generated 3d information from:
-https://baidu-nlp.bj.bcebos.com/PaddleHelix/datasets/compound_datasets/pcqm4mv2_gem2.tgz
+You can also download the processed PCQM4Mv2 dataset with rdkit-generated 3d information from [here](https://baidu-nlp.bj.bcebos.com/PaddleHelix/datasets/compound_datasets/pcqm4mv2_gem2.tgz).
 And then use tar to unzip the data.
 ```bash
-mkdir -p ../data
-tar xzf pcqm4mv2_gem2.tgz -C ../data
+wget https://baidu-nlp.bj.bcebos.com/PaddleHelix/datasets/compound_datasets/pcqm4mv2_gem2.tgz
+mkdir -p ../data
+tar xzf pcqm4mv2_gem2.tgz -C ../data
+```
+
+## Pretrained checkpoints
+We release the checkpoint for reproducing the results on PCQM4Mv2, which can also serve as a pretrained model for downstream tasks.
+```bash
+wget https://baidu-nlp.bj.bcebos.com/PaddleHelix/models/molecular_modeling/gem2_l12_c256.pdparams
+mkdir -p model
+mv gem2_l12_c256.pdparams model
 ```
 
-# How to run
-## Introduction to related configs
+## How to run
+### Introduction to related configs
 You can adjust the json files in the config folder to change the training settings.
-### dataset_config
+#### dataset_config
 - `data_dir`: where the data is located
 - `task_names`: the name of the label column in the datafile
 
-### model_config
+#### model_config
 - `model`: model-related information, like the channel size and dropout
 - `data`: data transform settings
 
 
-
-### train_config
+#### train_config
 - `lr`: learning rate
 - `warmup_step`: the number of steps to warm the learning rate up to `lr`
 - `mid_step`: the number of steps before learning rate decay
 
-## Start training
+### Train on PCQM4Mv2
 
     sh scripts/train.sh
 
-The models will be saved under `./model`.
+The models will be saved under `./model`. It will take around 60 minutes to finish one epoch on 16 A100 cards with a total batch size of 512.
 
-It will take around 60 mintues to finish one epoch on 16 A100 cards with total batch size of 512.
+### Inference on the valid set of PCQM4Mv2
+To reproduce the result from the ogb leaderboard, just run the inference command:
 
-## Run inference
-To reproduce the result from the ogb leaderboard, you can download the checkponit from:
-https://baidu-nlp.bj.bcebos.com/PaddleHelix/models/molecular_modeling/gem2_l12_c256.pdparams
-Then put it under the local `./model` folder and run the inference command:
     sh scripts/inference.sh
 
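For orientation, a hypothetical illustration of the config keys the section above lists; the key names come from this README, the values are invented:

```python
import json

# dataset_config (hypothetical values)
dataset_config = {
    "data_dir": "../data/pcqm4mv2_gem2",  # where the data is located
    "task_names": ["homolumogap"],        # label column(s) in the datafile
}

# train_config (hypothetical values)
train_config = {
    "lr": 1e-4,           # peak learning rate
    "warmup_step": 4000,  # steps to warm the learning rate up to lr
    "mid_step": 100000,   # steps before the learning rate starts to decay
}

print(json.dumps({"dataset": dataset_config, "train": train_config}, indent=2))
```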

@@ -80,10 +102,11 @@ If you use the code or data in this repo, please cite:
 
 ```bibtex
 @article{liu2022gem-2,
-  title={GEM-2: Next Generation Molecular Property Prediction Network with Many-body and Full-range Interaction Modeling
-  },
-  author={Liu, Lihang and He, Donglong and Fang, Xiaomin and Zhang, Shanzhuo and Wang, Fan and He, Jingzhou and Wu, Hua},
-  journal={arXiv preprint arXiv:2208.05863},
-  year={2022}
+  doi = {10.48550/ARXIV.2208.05863},
+  url = {https://arxiv.org/abs/2208.05863},
+  author = {Liu, Lihang and He, Donglong and Fang, Xiaomin and Zhang, Shanzhuo and Wang, Fan and He, Jingzhou and Wu, Hua},
+  title = {GEM-2: Next Generation Molecular Property Prediction Network by Modeling Full-range Many-body Interactions},
+  publisher = {arXiv},
+  year = {2022}
 }
 ```

apps/pretrained_compound/ChemRL/GEM-2/src/optimus.py (+9)

@@ -534,6 +534,15 @@ def init_weights(self, layer):
         elif isinstance(layer, nn.LayerNorm):
             layer._epsilon = 1e-12
 
+    def reduce_dropout(self):
+        """
+        Set the model's dropout rate to 0.
+        """
+        def reduce_p(layer):
+            if isinstance(layer, nn.Dropout):
+                layer.p = 0
+        self.apply(reduce_p)
+
     def _create_mask(self, batch):
         node_mask = batch["node_mask"]  # (B, N)
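The new `reduce_dropout` relies on `paddle.nn.Layer.apply`, which calls the given function on the layer and every sublayer recursively (standard Paddle behavior). A self-contained check of the pattern, separate from the GEM-2 code:

```python
import paddle.nn as nn

# Layer.apply(fn) visits the layer and all sublayers, so a single
# isinstance check zeroes out every Dropout in the model.
net = nn.Sequential(nn.Linear(8, 8), nn.Dropout(0.1),
                    nn.Linear(8, 8), nn.Dropout(0.2))

def reduce_p(layer):
    if isinstance(layer, nn.Dropout):
        layer.p = 0

net.apply(reduce_p)
print([l.p for l in net.sublayers() if isinstance(l, nn.Dropout)])  # [0, 0]
```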

apps/pretrained_compound/ChemRL/GEM-2/src/paddle_utils.py (+4 −4)

@@ -37,8 +37,8 @@ def dist_mean(array, distributed=False):
     n = len(array)
     x_sum = 0 if n == 0 else np.sum(array)
     if distributed:
-        n = dist_all_reduce(paddle.to_tensor(n, dtype='int64')).numpy()[0]
-        x_sum = dist_all_reduce(paddle.to_tensor(x_sum, dtype='float32')).numpy()[0]
+        n = int(dist_all_reduce(paddle.to_tensor(n, dtype='int64')))
+        x_sum = float(dist_all_reduce(paddle.to_tensor(x_sum, dtype='float32')))
     x_mean = 0 if n == 0 else x_sum / n
     return x_mean

@@ -47,14 +47,14 @@ def dist_sum(array, distributed=False):
     n = len(array)
     x_sum = 0 if n == 0 else np.sum(array)
     if distributed:
-        x_sum = dist_all_reduce(paddle.to_tensor(x_sum, dtype='float32')).numpy()[0]
+        x_sum = float(dist_all_reduce(paddle.to_tensor(x_sum, dtype='float32')))
     return x_sum
 
 
 def dist_length(array, distributed=False):
     n = len(array)
     if distributed:
-        n = dist_all_reduce(paddle.to_tensor(n, dtype='int64')).numpy()[0]
+        n = int(dist_all_reduce(paddle.to_tensor(n, dtype='int64')))
     return n

apps/pretrained_compound/ChemRL/GEM-2/train_gem2.py (+2 −20)

@@ -80,7 +80,7 @@ def get_train_steps_per_epoch(dataset_len, args):
     min_data_len = paddle.to_tensor(dataset_len)
     from paddle.distributed import ReduceOp
     dist.all_reduce(min_data_len, ReduceOp.MIN)
-    dataset_len = min_data_len.numpy()[0]
+    dataset_len = int(min_data_len)
     logging.info(f'min dataset len: {dataset_len}')
     return int(dataset_len / args.batch_size) - 5
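Context for the hunk above: reducing the per-rank dataset lengths with MIN gives every worker the same step count, so uneven shards cannot desynchronize collective operations. A hedged standalone sketch (values are hypothetical):

```python
import paddle
import paddle.distributed as dist
from paddle.distributed import ReduceOp

dist.init_parallel_env()  # required before any collective op

# Hypothetical per-rank shard length; in train_gem2.py this is the local
# dataset length passed into get_train_steps_per_epoch.
local_len = paddle.to_tensor(9973, dtype='int64')
dist.all_reduce(local_len, ReduceOp.MIN)  # every rank now holds the global MIN

# int() replaces the old `.numpy()[0]`, matching the zero-tensor fix.
steps_per_epoch = int(local_len) // 512   # 512 = hypothetical global batch size
```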

@@ -228,24 +228,6 @@ def evaluate(args, epoch_id, model, test_dataset, collate_fn):
     return mean_mae
 
 
-def adjust_dropout(model_config, encoder_config, last_ck_path):
-    """
-    adjust the dropout rate of the model to achieve better performance
-    """
-    encoder_config.init_dropout_rate = 0
-    encoder_config.optimus_block.first_body_axial_attention_dropout = 0
-    encoder_config.optimus_block.pair_dropout_rate = 0
-    encoder_config.optimus_block.first_body_axial_attention.dropout_rate = 0
-    encoder_config.optimus_block.node_ffn.dropout_rate = 0
-    encoder_config.optimus_block.second_body_first_axis.dropout_rate = 0
-    encoder_config.optimus_block.second_body_second_axis.dropout_rate = 0
-    encoder_config.optimus_block.pair_ffn.dropout_rate = 0
-    model = MolRegressionModel(model_config, encoder_config)
-    model.set_state_dict(paddle.load(last_ck_path))
-    print('Load state_dict from %s' % last_ck_path)
-    return model
-
-
 def main(args):
     """
     Call the configuration function of the model, build the model and load data, then start training.

@@ -326,7 +308,7 @@ def _read_json(path):
             ema.register()
 
         if epoch_id == 69:
-            model = adjust_dropout(model_config, encoder_config, f'./{args.model_dir}/epoch_{epoch_id - 1}.pdparams')
+            model.encoder.reduce_dropout()
 
         ## train
         s_time = time.time()

apps/protein_folding/helixfold-single/README.md (+2 −2)

@@ -16,8 +16,7 @@ To reproduce the results reported in our paper, specific environment settings are
 
 ## Installation
 Except those listed in `requirements.txt`, the PaddlePaddle `dev` package is required to run HelixFold.
-Visit [here](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html) to
-install PaddlePaddle `dev`. Also, we provide a package here if your machine environment is Nvidia A100 with
+Visit [here](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html) to install PaddlePaddle `dev`. Also, we provide a package here if your machine environment is Nvidia A100 with
 cuda=11.2.
 
 ```bash

@@ -45,6 +44,7 @@ python helixfold_single_inference.py \
 
 - `init_model`: the trained model.
 - `fasta_file`: the fasta file which contains the protein sequence to be predicted.
+- `output_dir`: the path to the output.
 
 The output is organized as:

(new file; filename hidden in this view)

@@ -0,0 +1,2 @@
+>T1026 FBNSV, , 172 residues|
+MVSNWNWSGKKGRRTPRRGYTRPFKSAVPTTRVVVHQSAVLKKDDVSGSEIKPEGDVARYKIRKVMLSCTLRMRPGELVNYLIVKCSSPIVNWSAAFTAPALMVKESCQDMITIIGKGKVESNGVAGSDCTKSFNKFIRLGAGISQTQHLYVVMYTSEAVKTVLEHRVYIEV
