Add the processed data and checkpoint download link

hedonglong · hedonglong · commit f74a1e0fa04b · 2022-10-21T11:02:16.000+08:00
diff --git a/apps/pretrained_compound/ChemRL/GEM-2/README.md b/apps/pretrained_compound/ChemRL/GEM-2/README.md
@@ -32,6 +32,15 @@ You can download the PCQM4Mv2 dataset from ogb website:
     
     https://dgl-data.s3-accelerate.amazonaws.com/dataset/OGB-LSC/pcqm4m-v2.zip
 
+# Processed Data
+You can download the processed PCQM4Mv2 dataset, which includes RDKit-generated 3D information, from:
+    https://baidu-nlp.bj.bcebos.com/PaddleHelix/datasets/compound_datasets/pcqm4mv2_gem2.tgz
+Then extract the archive with tar:
+```bash
+  mkdir -p ../data
+  tar xzf pcqm4mv2_gem2.tgz -C ../data
+```
+
 # How to run
 ## Introduction to related configs
 You can adjsut the json files in the config folder to  change the training settings.
@@ -58,6 +67,12 @@ The models will be saved under `./model`.
 
 It will take around 60 mintues to finish one epoch on 16 A100 cards with total batch size of 512.
 
+## Run inference
+To reproduce the result from the OGB leaderboard, you can download the checkpoint from:
+    https://baidu-nlp.bj.bcebos.com/PaddleHelix/models/molecular_modeling/gem2_l12_c256.pdparams
+Then put it under the local `./model` folder and run the inference command:
+    sh scripts/inference.sh
+
 
 ## Citing this work
 
diff --git a/apps/pretrained_compound/ChemRL/GEM-2/scripts/inference.sh b/apps/pretrained_compound/ChemRL/GEM-2/scripts/inference.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+cd "$(dirname "$0")" || exit 1
+cd .. || exit 1
+# Run train_gem2.py with --inference on the checkpoint given by $init_model.
+train_pcqm(){
+    mkdir -p "log/$exp_name" "model/$exp_name"
+    python3 train_gem2.py  \
+        --inference \
+        --batch_size="$batch_size" \
+        --num_workers=10 \
+        --max_epoch=150 \
+        --dataset_config="$dataset_config" \
+        --data_cache_dir="$data_cache_dir" \
+        --model_config="$model_config" \
+        --encoder_config="$encoder_config" \
+        --train_config="$train_config" \
+        --init_model="$init_model" \
+        --start_step="$start_step" \
+        --model_dir="./model/$exp_name" \
+        --log_dir="./log/$exp_name"
+}
+
+# Settings read by train_pcqm; exp_name is empty, so ./model and ./log are used directly.
+exp_name=""
+batch_size=32
+dataset_config="configs/dataset_configs/pcqmv2.json"
+data_cache_dir="../data/pcqm4m-v2-rdkit3d"
+model_config="configs/model_configs/mol_regr-optimus-mae.json"
+encoder_config="configs/model_configs/opt3d_l12_c256.json"
+init_model="./model/gem2_l12_c256.pdparams"
+train_config="configs/train_configs/lr4e-4-mid40.json"
+start_step=0
+train_pcqm
diff --git a/apps/pretrained_compound/ChemRL/GEM-2/scripts/train.sh b/apps/pretrained_compound/ChemRL/GEM-2/scripts/train.sh
@@ -22,7 +22,7 @@ train_pcqm(){
         
 }
 
-
+exp_name="gem2_l12_c256_4e-4"
 echo "$exp_name"
 
 
diff --git a/apps/pretrained_compound/ChemRL/GEM-2/src/config.py b/apps/pretrained_compound/ChemRL/GEM-2/src/config.py
@@ -84,9 +84,9 @@ def make_updated_config(base_config, updated_dict):
 
     "optimus_block_num": 12,
     "optimus_block": {
-        "node_dropout_rate": 0.05,
+        "first_body_axial_attention_dropout": 0.05,
         "pair_dropout_rate": 0.05,
-        "node_attention": {
+        "first_body_axial_attention": {
             "use_pair_layer_norm": True,
             "num_head": 8,
             "dropout_rate": 0.05,
@@ -96,15 +96,15 @@ def make_updated_config(base_config, updated_dict):
             "hidden_factor": 4,
             "dropout_rate": 0.1
         },
-        "outer_product": {
+        "low2high": {
             "inner_channel": 32
         },
-        "triangle_attention_start_node": {
+        "second_body_first_axis": {
             "num_head": 8,
             "dropout_rate": 0.05,
             "is_start": True
         },
-        "triangle_attention_end_node": {
+        "second_body_second_axis": {
             "num_head": 8,
             "dropout_rate": 0.05,
             "is_start": False
diff --git a/apps/pretrained_compound/ChemRL/GEM-2/src/optimus.py b/apps/pretrained_compound/ChemRL/GEM-2/src/optimus.py
@@ -122,10 +122,10 @@ def forward(self, batch):
         return results
         
 
-class NodeAttention(nn.Layer):
+class FirstBodyAxialAttention(nn.Layer):
     """Compute self-attention over columns of a 2D input."""
     def __init__(self, model_config, global_config):
-        super(NodeAttention, self).__init__()
+        super(FirstBodyAxialAttention, self).__init__()
         self.model_config = model_config
 
         node_channel = global_config.node_channel
@@ -253,9 +253,9 @@ def forward(self, x):
         return x
 
 
-class OuterProductMean(nn.Layer):
+class Low2HighModule(nn.Layer):
     def __init__(self, model_config, global_config):
-        super(OuterProductMean, self).__init__()
+        super(Low2HighModule, self).__init__()
         node_channel = global_config.node_channel
         pair_channel = global_config.pair_channel
         inner_channel = model_config.inner_channel
@@ -285,9 +285,9 @@ def forward(self, node_acts, node_mask):
         return act
 
 
-class TriangleAttentionWithAngle(nn.Layer):
+class SecondBodyAxialAttentionWithAngle(nn.Layer):
     def __init__(self, model_config, global_config):
-        super(TriangleAttentionWithAngle, self).__init__()
+        super(SecondBodyAxialAttentionWithAngle, self).__init__()
         pair_channel = global_config.pair_channel
         triple_channel = global_config.triple_channel
         self.num_head = model_config.num_head
@@ -342,9 +342,9 @@ def forward(self, pair_acts, triple_acts, bias):
         return out
 
 
-class TriangleAttentionWithAngleBias(nn.Layer):
+class SecondBodyAxialAttentionWithAngleBias(nn.Layer):
     def __init__(self, model_config, global_config):
-        super(TriangleAttentionWithAngleBias, self).__init__()
+        super(SecondBodyAxialAttentionWithAngleBias, self).__init__()
         pair_channel = global_config.pair_channel
         triple_channel = global_config.triple_channel
         self.num_head = model_config.num_head
@@ -395,15 +395,15 @@ def forward(self, pair_acts, triple_acts, bias):
         return out
 
 
-class TriangleAttention(nn.Layer):
+class SecondBodyAxialAttention(nn.Layer):
     def __init__(self, model_config, global_config):
-        super(TriangleAttention, self).__init__()
+        super(SecondBodyAxialAttention, self).__init__()
         self.is_start = model_config.is_start
 
         if model_config.get('angle_as_bias', False):
-            self.attn_mod = TriangleAttentionWithAngleBias(model_config, global_config)
+            self.attn_mod = SecondBodyAxialAttentionWithAngleBias(model_config, global_config)
         else:
-            self.attn_mod = TriangleAttentionWithAngle(model_config, global_config)
+            self.attn_mod = SecondBodyAxialAttentionWithAngle(model_config, global_config)
     
     def forward(self, pair_acts, triple_acts, triple_mask):
         """
@@ -431,29 +431,29 @@ def __init__(self, model_config, global_config):
         pair_channel = global_config.pair_channel
         
         ### node track
-        self.node_attn = NodeAttention(
-                model_config.node_attention, global_config)
-        self.node_attn_dropout = nn.Dropout(model_config.node_dropout_rate)
+        self.first_body_axial_attention = FirstBodyAxialAttention(
+                model_config.first_body_axial_attention, global_config)
+        self.first_body_axial_attention_dropout = nn.Dropout(model_config.first_body_axial_attention_dropout)
 
         self.node_ffn = FeedForwardNetwork(
                 model_config.node_ffn, node_channel)
-        self.node_ffn_dropout = nn.Dropout(model_config.node_dropout_rate)
+        self.node_ffn_dropout = nn.Dropout(model_config.first_body_axial_attention_dropout)
 
-        ### outer
-        self.outer_product = OuterProductMean(
-                model_config.outer_product, global_config)
-        self.outer_product_dropout = nn.Dropout(model_config.pair_dropout_rate)
+        ### low2high
+        self.low2high = Low2HighModule(
+                model_config.low2high, global_config)
+        self.low2high_dropout = nn.Dropout(model_config.pair_dropout_rate)
 
         ### pair track
         self.pair_before_ln = nn.LayerNorm(pair_channel)
 
-        self.triangle_attn_start = TriangleAttention(
-                model_config.triangle_attention_start_node, global_config)
-        self.triangle_attn_start_dropout = nn.Dropout(model_config.pair_dropout_rate)
+        self.second_body_first_axis = SecondBodyAxialAttention(
+                model_config.second_body_first_axis, global_config)
+        self.second_body_first_axis_dropout = nn.Dropout(model_config.pair_dropout_rate)
 
-        self.triangle_attn_end = TriangleAttention(
-                model_config.triangle_attention_end_node, global_config)
-        self.triangle_attn_end_dropout = nn.Dropout(model_config.pair_dropout_rate)
+        self.second_body_second_axis = SecondBodyAxialAttention(
+                model_config.second_body_second_axis, global_config)
+        self.second_body_second_axis_dropout = nn.Dropout(model_config.pair_dropout_rate)
 
         self.pair_ffn = FeedForwardNetwork(
                 model_config.pair_ffn, pair_channel)
@@ -476,24 +476,24 @@ def forward(self, node_acts, pair_acts, triple_acts, mask_dict):
         triple_mask = mask_dict['triple']
 
         # node track
-        residual = self.node_attn(node_acts, pair_acts, node_mask, pair_mask)
-        node_acts += self.node_attn_dropout(residual)
+        residual = self.first_body_axial_attention(node_acts, pair_acts, node_mask, pair_mask)
+        node_acts += self.first_body_axial_attention_dropout(residual)
 
         residual = self.node_ffn(node_acts)
         node_acts += self.node_ffn_dropout(residual)
 
         # outer
-        outer = self.outer_product(node_acts, node_mask)
-        pair_acts += self.outer_product_dropout(outer)
+        outer = self.low2high(node_acts, node_mask)
+        pair_acts += self.low2high_dropout(outer)
 
         # pair track
         pair_acts = self.pair_before_ln(pair_acts)
 
-        residual = self.triangle_attn_start(pair_acts, triple_acts, triple_mask)
-        pair_acts += self.triangle_attn_start_dropout(residual)
+        residual = self.second_body_first_axis(pair_acts, triple_acts, triple_mask)
+        pair_acts += self.second_body_first_axis_dropout(residual)
 
-        residual = self.triangle_attn_end(pair_acts, triple_acts, triple_mask)
-        pair_acts += self.triangle_attn_end_dropout(residual)
+        residual = self.second_body_second_axis(pair_acts, triple_acts, triple_mask)
+        pair_acts += self.second_body_second_axis_dropout(residual)
 
         residual = self.pair_ffn(pair_acts)
         pair_acts += self.pair_ffn_dropout(residual)
diff --git a/apps/pretrained_compound/ChemRL/GEM-2/train_gem2.py b/apps/pretrained_compound/ChemRL/GEM-2/train_gem2.py
@@ -75,15 +75,13 @@ def get_optimizer(args, train_config, model):
     return optimizer, scheduler
 
 
-def get_train_steps_per_epoch(dataset_len):
-    if args.DEBUG:
-        return 20
-    # add as argument
-    min_data_len = paddle.to_tensor(dataset_len)
-    from paddle.distributed import ReduceOp
-    dist.all_reduce(min_data_len, ReduceOp.MIN)
-    dataset_len = min_data_len.numpy()[0]
-    logging.info(f'min dataset len: {dataset_len}')
+def get_train_steps_per_epoch(dataset_len, args):
+    if args.distributed:
+        min_data_len = paddle.to_tensor(dataset_len)
+        from paddle.distributed import ReduceOp
+        dist.all_reduce(min_data_len, ReduceOp.MIN)
+        dataset_len = min_data_len.numpy()[0]
+        logging.info(f'min dataset len: {dataset_len}')
     return int(dataset_len / args.batch_size) - 5
 
 
@@ -135,7 +133,10 @@ def load_data(args, trainer_id, trainer_num, model_config, dataset_config, trans
     if args.DEBUG:
         train_npz_files = train_npz_files[:16]
         valid_npz_files = valid_npz_files[:8]
-    train_dataset = InMemoryDataset(npz_data_files=train_npz_files[trainer_id::trainer_num])
+    if args.inference:
+        train_dataset = []
+    else:
+        train_dataset = InMemoryDataset(npz_data_files=train_npz_files[trainer_id::trainer_num])
     valid_dataset = InMemoryDataset(npz_data_files=valid_npz_files[trainer_id::trainer_num]) 
     if model_config.data.get('post_transform', False):
         logging.info('post transform')
@@ -227,6 +228,24 @@ def evaluate(args, epoch_id, model, test_dataset, collate_fn):
     return mean_mae
 
 
+def adjust_dropout(model_config, encoder_config, last_ck_path):
+    """
+    Set the encoder dropout rates below to 0 (in place) and return a new model loaded from `last_ck_path`.
+    """
+    encoder_config.init_dropout_rate = 0
+    encoder_config.optimus_block.first_body_axial_attention_dropout = 0
+    encoder_config.optimus_block.pair_dropout_rate = 0
+    encoder_config.optimus_block.first_body_axial_attention.dropout_rate = 0
+    encoder_config.optimus_block.node_ffn.dropout_rate = 0
+    encoder_config.optimus_block.second_body_first_axis.dropout_rate = 0
+    encoder_config.optimus_block.second_body_second_axis.dropout_rate = 0
+    encoder_config.optimus_block.pair_ffn.dropout_rate = 0
+    model = MolRegressionModel(model_config, encoder_config)
+    model.set_state_dict(paddle.load(last_ck_path))
+    print('Load state_dict from %s' % last_ck_path)
+    return model
+
+
 def main(args):
     """
     Call the configuration function of the model, build the model and load data, then start training.
@@ -284,6 +303,10 @@ def _read_json(path):
     ema_start_step = 0 if args.DEBUG else 30
 
     optimizer, scheduler = get_optimizer(args, train_config, model)
+    if args.inference:
+        mean_mae = evaluate(args, 0, model, valid_dataset, collate_fn)
+        print(f"mean mae : {mean_mae}")
+        exit(0)
 
     ### start train
     data_writer = None
@@ -292,7 +315,7 @@ def _read_json(path):
             data_writer = SummaryWriter(f'{args.log_dir}/tensorboard_log_dir', max_queue=0)
         except Exception as ex:
             print(f'Create data_writer failed: {ex}')
-    train_steps = get_train_steps_per_epoch(len(train_dataset))
+    train_steps = get_train_steps_per_epoch(len(train_dataset), args)
     print("train_steps per epoch : ", train_steps)
     mean_mae_list = []
     for _ in range(args.start_step):
@@ -301,6 +324,9 @@ def _read_json(path):
         ## ema register
         if epoch_id >= ema_start_step and not ema.is_registered:
             ema.register()
+        
+        if epoch_id == 69:
+            model = adjust_dropout(model_config, encoder_config, f'./{args.model_dir}/epoch_{epoch_id - 1}.pdparams')
 
         ## train
         s_time = time.time()
@@ -362,6 +388,7 @@ def _read_json(path):
     parser.add_argument("--model_config", type=str)
     parser.add_argument("--encoder_config", type=str)
     parser.add_argument("--train_config", type=str)
+    parser.add_argument("--inference", action='store_true', default=False)
     parser.add_argument("--init_model", type=str)
     parser.add_argument("--start_step", type=int)
     parser.add_argument("--model_dir", type=str, default="./debug_models")

Original file line number	Diff line number	Diff line change
`@@ -22,7 +22,7 @@ train_pcqm(){`
`22`	`22`
`23`	`23`	`}`
`24`	`24`
`25`		`-`
	`25`	`+exp_name="gem2_l12_c256_4e-4"`
`26`	`26`	`echo "$exp_name"`
`27`	`27`
`28`	`28`