From cb65f839e4ed9f407237856e58d39646fee03d10 Mon Sep 17 00:00:00 2001 From: Brian Christian Date: Fri, 22 Aug 2025 10:20:48 -0700 Subject: [PATCH] remove trailing whitespace --- README.md | 14 +++++------ reward_models/grm_reward_trainer.py | 14 +++++------ reward_models/grm_utils.py | 14 +++++------ reward_models/load_datasets.py | 24 +++++++++---------- reward_models/reward_trainer.py | 4 ++-- reward_models/run_grm_reward_train.py | 14 +++++------ reward_models/run_reward_models_train.py | 16 ++++++------- reward_models/utils.py | 2 +- rlhf/bon/README.md | 2 +- rlhf/bon/load_datasets.py | 20 ++++++++-------- ...step1_train_proxy_reward_model_baseline.py | 18 +++++++------- .../bon/step1_train_proxy_reward_model_grm.py | 14 +++++------ rlhf/bon/step2_generate_samples.py | 8 +++---- rlhf/bon/step3_obtain_proxy_score.py | 10 ++++---- rlhf/bon/step4_choose_best_of_n.py | 6 ++--- rlhf/bon/step5_obtain_bon_gold_score.py | 4 ++-- rlhf/bon/step6_collect.py | 16 ++++++------- rlhf/bon/utils.py | 4 ++-- rlhf/data_generation/load_datasets.py | 4 ++-- rlhf/data_generation/obtain_gold_score.py | 20 ++++++++-------- rlhf/data_generation/sample_dataset.py | 4 ++-- rlhf/data_generation/utils.py | 2 +- rlhf/ppo/config.py | 12 +++++----- rlhf/ppo/model_utils.py | 12 +++++----- rlhf/ppo/ppo.py | 21 ++++++++-------- rlhf/ppo/ppo_grm.py | 21 ++++++++-------- rlhf/ppo/ppo_rm_ensemble.py | 19 +++++++-------- rlhf/ppo/ppo_utils.py | 8 +++---- rlhf/ppo/rm_utils.py | 7 +++--- rm_eval/eval.py | 16 ++++++------- rm_eval/eval_grm.py | 14 +++++------ rm_eval/grm_utils.py | 16 ++++++------- rm_eval/load_eval_datasets.py | 14 +++++------ scripts/eval_bt_rm.sh | 4 ++-- scripts/eval_grm_rm.sh | 6 ++--- scripts/eval_grm_rm_full.sh | 6 ++--- ...step1_train_proxy_reward_model_baseline.sh | 2 +- scripts/rlhf/bon/step3_obtain_proxy_score.sh | 2 +- scripts/rlhf/bon/step6_collect.sh | 4 ++-- scripts/rlhf/data_generation4rlhf.sh | 8 +++---- scripts/rlhf/ppo/train_ppo.sh | 5 ++-- scripts/rlhf/ppo/train_ppo_ensemble.sh | 4 ++-- scripts/rlhf/ppo/train_ppo_grm.sh | 6 ++--- scripts/train_bt_rm_full.sh | 2 +- scripts/train_bt_rm_lora.sh | 2 +- scripts/train_grm_full.sh | 2 +- scripts/train_grm_lora.sh | 2 +- 47 files changed, 222 insertions(+), 227 deletions(-) diff --git a/README.md b/README.md index 45f4253..75e2849 100644 --- a/README.md +++ b/README.md @@ -8,9 +8,9 @@ Check out our GRM series below, which are evlauated on [reward-bench](https://hu -| Model | Average | Chat | Chat Hard | Safety | Reasoning | +| Model | Average | Chat | Chat Hard | Safety | Reasoning | |:-------------------------:|:-------------:|:---------:|:---------:|:--------:|:-----------:| -|[GRM_Llama3.1_8B_rewardmodel-ft](https://huggingface.co/Ray2333/GRM_Llama3.1_8B_rewardmodel-ft)**(8B)**| 92.6|95.0 |87.7|91.4|96.4| +|[GRM_Llama3.1_8B_rewardmodel-ft](https://huggingface.co/Ray2333/GRM_Llama3.1_8B_rewardmodel-ft)**(8B)**| 92.6|95.0 |87.7|91.4|96.4| |[GRM-Llama3-8B-rewardmodel-ft](https://huggingface.co/Ray2333/GRM-Llama3-8B-rewardmodel-ft)**(8B)**|91.5|95.5|86.2|90.8|93.6| |[GRM-Llama3.2-3B-rewardmodel-ft](https://huggingface.co/Ray2333/GRM-Llama3.2-3B-rewardmodel-ft)**(3B)**|90.9|91.6|84.9|92.7|94.6| | [GRM-gemma2-2B-rewardmodel-ft](https://huggingface.co/Ray2333/GRM-gemma2-2B-rewardmodel-ft) **(2B)**| 88.4 | 93.0 | 77.2 | 92.2 | 91.2 | @@ -22,8 +22,8 @@ Check out our GRM series below, which are evlauated on [reward-bench](https://hu |[GRM-Gemma-2B-rewardmodel-ft](https://huggingface.co/Ray2333/GRM-Gemma-2B-rewardmodel-ft) **(2B)**| 84.7 | 89.4 | 75.2 | 
85.5 | 88.8 | | [GRM-Gemma2-2B-sftreg](https://huggingface.co/Ray2333/GRM-Gemma2-2B-sftreg)**(2B)** | 81.0 | 97.2 | 59.6 | 86.9 | 80.3 | | openai/gpt-4o-2024-05-13 | 84.6| 96.6 | 70.4 | 86.5 | 84.9 | -| [GRM-Gemma-2B-sftreg](https://huggingface.co/Ray2333/GRM-Gemma-2B-sftreg)**(2B)** | 75.3 | 95.5 | 48.7 | 80.0 | 76.8 | -| [Gemma-2B-rewardmodel-baseline](https://huggingface.co/Ray2333/Gemma-2B-rewardmodel-baseline)**(2B)** | 73.7 | 94.1 | 46.1 | 79.6 | 75.0 | +| [GRM-Gemma-2B-sftreg](https://huggingface.co/Ray2333/GRM-Gemma-2B-sftreg)**(2B)** | 75.3 | 95.5 | 48.7 | 80.0 | 76.8 | +| [Gemma-2B-rewardmodel-baseline](https://huggingface.co/Ray2333/Gemma-2B-rewardmodel-baseline)**(2B)** | 73.7 | 94.1 | 46.1 | 79.6 | 75.0 | @@ -35,7 +35,7 @@ We also evaluated the GRM series using [PPE](https://github.com/lmarena/PPE/tree |[GRM-llama3-8B-sftreg](https://huggingface.co/Ray2333/GRM-llama3-8B-sftreg)**(8B)**| 62.7 | 66.6 | 60.4| 55.6| 70.9| 59.5 | 63.4| |[GRM-Llama3-8B-rewardmodel-ft](https://huggingface.co/Ray2333/GRM-Llama3-8B-rewardmodel-ft)**(8B)**| 61.4 | 64.2 | 59.6 | 56.2 | 72.3 | 53.3 | 62.5 | |[GRM-llama3.2-3B-sftreg](https://huggingface.co/Ray2333/GRM-llama3.2-3B-sftreg)**(3B)**| 61.3 |63.9 |58.7 | 55.6| 74.7| 53.1 | 62.0 | -| ArmoRM-Llama3-8B-v0.1 | 61.2 | 66.5 | 58.4 | 57.0 | 70.7 | 54.2 | 60.6| +| ArmoRM-Llama3-8B-v0.1 | 61.2 | 66.5 | 58.4 | 57.0 | 70.7 | 54.2 | 60.6| |Skywork-Reward-Llama-3.1-8B | 61.0 | 64.3 | 61.5 | 56.5 | 69.7 | 51.6 | 62.4| |Nemotron-4-340B-Reward | 60.4| 69.7 | 62.7 | 56.6 | 65.1 | 49.2 | 59.3 | |[GRM-Llama3.2-3B-rewardmodel-ft](https://huggingface.co/Ray2333/GRM-Llama3.2-3B-rewardmodel-ft)**(3B)**| 59.2 | 62.2 | 57.4 | 56.1 | 72.4 | 46.2 | 60.8 | @@ -44,7 +44,7 @@ We also evaluated the GRM series using [PPE](https://github.com/lmarena/PPE/tree -## Usage +## Usage First set the environment variable. ``` export HF_HOME='your HF token' @@ -118,6 +118,6 @@ sh train_ppo_ensemble.sh ``` ## Acknowledgment -This repo is built upon [transformers](https://github.com/huggingface/transformers) and [trl](https://github.com/huggingface/trl), with also inspiration from [RLHFlow](https://github.com/RLHFlow/RLHF-Reward-Modeling). +This repo is built upon [transformers](https://github.com/huggingface/transformers) and [trl](https://github.com/huggingface/trl), with also inspiration from [RLHFlow](https://github.com/RLHFlow/RLHF-Reward-Modeling). 
diff --git a/reward_models/grm_reward_trainer.py b/reward_models/grm_reward_trainer.py index 5901366..6b53afa 100644 --- a/reward_models/grm_reward_trainer.py +++ b/reward_models/grm_reward_trainer.py @@ -51,18 +51,18 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: label_rejected_paded = torch.tensor(feature["label_rejected"].tolist() + [self.label_pad_token_id] * (paded_length - len(feature["label_rejected"])) , dtype=torch.int64) label_paded.extend([label_chosen_paded.view(1, -1), label_rejected_paded.view(1, -1)]) label_paded = torch.concatenate(label_paded, dim=0) - + batch = { "input_ids": batch["input_ids"], "attention_mask": batch["attention_mask"], "return_loss": True, - "label": label_paded, + "label": label_paded, } return batch -class GRMRewardTrainer(RewardTrainer): +class GRMRewardTrainer(RewardTrainer): def __init__(self, **kwargs): self.reference_free = kwargs.pop('reference_free', True) self.reference_model = kwargs.pop('reference_model', None) @@ -77,7 +77,7 @@ def __init__(self, **kwargs): def get_batch_logps( - self, + self, logits: torch.FloatTensor, labels: torch.LongTensor, average_log_prob: bool = False, @@ -104,10 +104,10 @@ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=N if not self.reference_free: with torch.no_grad(): ref_logits = self.reference_model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])[0] - + bsz = rewards.size(0) jidx = torch.arange(0, bsz, 2) # chosen_ids - kidx = jidx + 1 # rejected_ids + kidx = jidx + 1 # rejected_ids reward_loss = -nn.functional.logsigmoid(rewards[jidx] - rewards[kidx]).mean() ## text-generation regularization @@ -121,7 +121,7 @@ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=N else: dpo_loss = -F.logsigmoid(self.beta * (pi_logratios)).mean() else: - pi_logratios = logps[jidx] - logps[kidx] + pi_logratios = logps[jidx] - logps[kidx] if self.reference_free or self.sft_only: ref_logratios = torch.tensor(0.0) else: diff --git a/reward_models/grm_utils.py b/reward_models/grm_utils.py index ee305a3..beffeec 100644 --- a/reward_models/grm_utils.py +++ b/reward_models/grm_utils.py @@ -57,7 +57,7 @@ def __init__(self, config, **kwargs): for i in range(num_layers): module_lis.extend([nn.Linear(input_neurons, num_neurons), nn.ReLU()]) input_neurons = num_neurons - + module_lis.append(nn.Linear(num_neurons, num_output)) self.summary = nn.Sequential(*module_lis) self.flatten = nn.Flatten() @@ -137,7 +137,7 @@ def forward( last_hidden_state = last_hidden_state.to(self.v_head.summary.weight.device) elif not hasattr(self.v_head.summary, 'weight') and (last_hidden_state.device != self.v_head.summary[0].weight.device): last_hidden_state = last_hidden_state.to(self.v_head.summary[0].weight.device) - + # use the last token value as reward last_index = attention_mask.sum(dim=-1) - 1 value = self.v_head(last_hidden_state).squeeze(-1)[torch.arange(len(last_hidden_state)), last_index] @@ -164,7 +164,7 @@ def push_to_hub(self, *args, **kwargs): setattr(self.pretrained_model, "v_head", self.v_head) return self.pretrained_model.push_to_hub(*args, **kwargs) - + def post_init(self, state_dict): r""" @@ -203,7 +203,7 @@ def set_device_hook(module, input, outputs): self.register_forward_hook(set_device_hook) self.is_sequential_parallel = True - + @classmethod def register_for_auto_class(cls, auto_class="AutoModel"): if not isinstance(auto_class, str): @@ -234,7 +234,7 @@ def load_model_withhead(model_name, peft_name, tokenizer, device, \ if 
'Mistral' not in model_name: model_config['attn_implementation'] = "flash_attention_2" - + model = AutoModelForCausalLMWithValueHead.from_pretrained(model_name, **model_config) model.pretrained_model.resize_token_embeddings(len(tokenizer)) model.config.pad_token_id = tokenizer.pad_token_id @@ -255,7 +255,7 @@ def load_model_withhead(model_name, peft_name, tokenizer, device, \ loaded_state_dict = torch.load(os.path.join(peft_name, "pytorch_model.bin")) missing, unexpected = model.base_model.model.pretrained_model.load_state_dict(loaded_state_dict, strict=False) missing, unexpected = model.base_model.model.load_state_dict(loaded_state_dict, strict=False) - + if hasattr(model, 'merge_and_unload'): model = model.merge_and_unload() return model @@ -266,7 +266,7 @@ def model_withhead_forward(model, input_ids, attention_mask, device, forward_typ elif forward_type == 'dpo': res = model(input_ids.to(device), attention_mask=attention_mask.to(device)) if len(res) == 3: - logits, _, _ = res + logits, _, _ = res else: logits = res.logits if logits.shape[:-1] != labels.shape: diff --git a/reward_models/load_datasets.py b/reward_models/load_datasets.py index 5375ef3..a7c9d8e 100644 --- a/reward_models/load_datasets.py +++ b/reward_models/load_datasets.py @@ -7,7 +7,7 @@ # for vanilla chosen and reject style dataset, such as dendrydong/preference_700K def build_dataset(data_path, tokenizer, split='train', size=None, model_name=''): ds = load_dataset(data_path, split=split) - + if size is not None: ds = ds.select(range(0, size)) @@ -40,7 +40,7 @@ def formatting_func(example): "input_ids_rejected": tokens_rejected["input_ids"][0], "attention_mask_rejected": tokens_rejected["attention_mask"][0], } - ds = ds.map(formatting_func, batched=False, num_proc=10) + ds = ds.map(formatting_func, batched=False, num_proc=10) remove_columns = [] for col in ds.column_names: if 'input' not in col and 'attention' not in col and 'label' not in col: @@ -57,15 +57,15 @@ def build_dataset_UF(data_path, tokenizer, split='train', size=None, mode='', mo ds = load_dataset(data_path, 'all', split=split) except: ds = load_dataset(data_path, split=split) - + # filter data with the same rating ds = ds.filter(lambda example: example['conv_A_rating'] != example['conv_B_rating'], num_proc=30) if len(mode): if mode == '40k' or mode == '40K': - ds = ds.select(range(0, len(ds), 20)) + ds = ds.select(range(0, len(ds), 20)) elif mode == '400k' or mode == '400K': - ds = ds.select(range(0, len(ds), 2)) + ds = ds.select(range(0, len(ds), 2)) if size is not None: ds = ds.select(range(0, size)) @@ -80,11 +80,11 @@ def formatting_func(example): chosen_messages = example['conv_B'] rejected_messages = example['conv_A'] margin = example['conv_B_rating'] - example['conv_A_rating'] - + if 'summarize' in example['source']: chosen_messages[0]['content'] = 'Generate one-sentence summary for the following post: ' + chosen_messages[0]['content'].strip() rejected_messages[0]['content'] = 'Generate one-sentence summary for the following post: ' + rejected_messages[0]['content'].strip() - + prompt_plus_chosen_response = tokenizer.apply_chat_template(chosen_messages, tokenize=False) prompt_plus_rejected_response = tokenizer.apply_chat_template(rejected_messages, tokenize=False) tokens_chosen = tokenizer.encode_plus(prompt_plus_chosen_response, **kwargs) @@ -108,9 +108,9 @@ def formatting_func(example): return { "input_ids_chosen": tokens_chosen["input_ids"][0], "attention_mask_chosen": tokens_chosen["attention_mask"][0], "input_ids_rejected": 
tokens_rejected["input_ids"][0], "attention_mask_rejected": tokens_rejected["attention_mask"][0], - "margin": margin, + "margin": margin, } - + ds = ds.map(formatting_func, batched=False, num_proc=10) # ds = ds.filter(lambda x: len(x["input_ids_chosen"]) <= script_args.max_length and len(x["input_ids_rejected"]) <= script_args.max_length, num_proc=30) @@ -161,7 +161,7 @@ def formatting_func(example): "input_ids_rejected": tokens_rejected["input_ids"][0], "attention_mask_rejected": tokens_rejected["attention_mask"][0], } - ds = ds.map(formatting_func, batched=False, num_proc=10) + ds = ds.map(formatting_func, batched=False, num_proc=10) ds.set_format(type="torch") return ds @@ -169,14 +169,14 @@ def formatting_func(example): def load_train_eval_dataset(data_path, tokenizer, size=None, mode='', model_name=''): if 'Unified' in data_path: # mode is only used for loading training data - train_dataset = build_dataset_UF(data_path, tokenizer, split='train', size=size, mode=mode, model_name=model_name) + train_dataset = build_dataset_UF(data_path, tokenizer, split='train', size=size, mode=mode, model_name=model_name) eval_dataset = build_dataset_UF(data_path, tokenizer, split='val', model_name=model_name) elif 'Skywork' in data_path: dataset = build_dataset_SK(data_path, tokenizer, split='train', size=size, model_name=model_name) dataset_split = dataset.train_test_split(test_size=0.005) train_dataset, eval_dataset = dataset_split['train'], dataset_split['test'] else: - dataset = build_dataset(data_path, tokenizer, split='train', size=size, model_name=model_name) + dataset = build_dataset(data_path, tokenizer, split='train', size=size, model_name=model_name) dataset_split = dataset.train_test_split(test_size=0.01) train_dataset, eval_dataset = dataset_split['train'], dataset_split['test'] return train_dataset, eval_dataset \ No newline at end of file diff --git a/reward_models/reward_trainer.py b/reward_models/reward_trainer.py index 18335f9..24bb69c 100644 --- a/reward_models/reward_trainer.py +++ b/reward_models/reward_trainer.py @@ -68,13 +68,13 @@ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=N rewards_k = rewards[kidx] if self.loss_type == 'bt': - loss = - nn.functional.logsigmoid(rewards_j - rewards_k).mean() + loss = - nn.functional.logsigmoid(rewards_j - rewards_k).mean() elif self.loss_type == 'pos_reg': loss = - nn.functional.logsigmoid(rewards_j - rewards_k).mean() - self.weight_ratio * nn.functional.logsigmoid(rewards_j.mean()) elif self.loss_type == 'margin': loss = -nn.functional.logsigmoid(rewards_j - rewards_k - torch.tensor(inputs["margin"], device=inputs["margin"][0].device).view(-1,1)).mean() elif self.loss_type == 'labelsmooth': - loss = - (1-self.weight_ratio) * nn.functional.logsigmoid(rewards_j - rewards_k).mean() - self.weight_ratio * nn.functional.logsigmoid(rewards_k - rewards_j).mean() + loss = - (1-self.weight_ratio) * nn.functional.logsigmoid(rewards_j - rewards_k).mean() - self.weight_ratio * nn.functional.logsigmoid(rewards_k - rewards_j).mean() else: raise NotImplementedError diff --git a/reward_models/run_grm_reward_train.py b/reward_models/run_grm_reward_train.py index 4d001c8..bc19229 100644 --- a/reward_models/run_grm_reward_train.py +++ b/reward_models/run_grm_reward_train.py @@ -23,13 +23,13 @@ @dataclass class ScriptArguments: # training args - per_device_train_batch_size: Optional[int] = field(default=1) + per_device_train_batch_size: Optional[int] = field(default=1) gradient_accumulation_steps: Optional[int] = 
field(default=16) learning_rate: Optional[float] = field(default=1e-5) num_train_epochs: Optional[int] = field(default=2, metadata={"help": "The number of training epochs for the reward model."}) optim: Optional[str] = field(default="adamw_hf", metadata={"help": "The optimizer to use."}) lr_scheduler_type: Optional[str] = field(default="cosine", metadata={"help": "The lr scheduler"},) - max_length: Optional[int] = field(default=1024) + max_length: Optional[int] = field(default=1024) gradient_checkpointing: Optional[bool] = field(default=True) bf16: Optional[bool] = field(default=True) attn_implementation: Optional[str] = field(default="flash_attention_2") @@ -64,9 +64,9 @@ class ScriptArguments: reference_free: Optional[bool] = field(default=True) sft_only: Optional[bool] = field(default=True) no_logsigmoid_sft: Optional[bool] = field(default=False) - - + + parser = HfArgumentParser(ScriptArguments) @@ -77,7 +77,7 @@ class ScriptArguments: else: output_name = f"{script_args.log_dir}/{model_name_split}_{script_args.wandb_name}_len{script_args.max_length}_fulltrain_{script_args.learning_rate}_data{script_args.dataset.split('/')[-1]}" -device = Accelerator().local_process_index +device = Accelerator().local_process_index training_args = TrainingArguments( output_dir=os.path.join(output_name, 'logs'), @@ -90,7 +90,7 @@ class ScriptArguments: save_strategy=script_args.save_strategy, save_steps=script_args.save_steps, gradient_accumulation_steps=script_args.gradient_accumulation_steps, - gradient_checkpointing=script_args.gradient_checkpointing, + gradient_checkpointing=script_args.gradient_checkpointing, bf16=script_args.bf16, logging_strategy="steps", logging_steps=10, @@ -136,7 +136,7 @@ class ScriptArguments: model = AutoModelForCausalLMWithValueHead.from_pretrained( - script_args.base_model, device_map=device, + script_args.base_model, device_map=device, torch_dtype=torch.bfloat16, **model_params, ) diff --git a/reward_models/run_reward_models_train.py b/reward_models/run_reward_models_train.py index dc0e995..2b71860 100644 --- a/reward_models/run_reward_models_train.py +++ b/reward_models/run_reward_models_train.py @@ -21,13 +21,13 @@ @dataclass class ScriptArguments: # training args - per_device_train_batch_size: Optional[int] = field(default=1) + per_device_train_batch_size: Optional[int] = field(default=1) gradient_accumulation_steps: Optional[int] = field(default=16) learning_rate: Optional[float] = field(default=1e-5) num_train_epochs: Optional[int] = field(default=2, metadata={"help": "The number of training epochs for the reward model."}) optim: Optional[str] = field(default="adamw_hf", metadata={"help": "The optimizer to use."}) lr_scheduler_type: Optional[str] = field(default="cosine", metadata={"help": "The lr scheduler"},) - max_length: Optional[int] = field(default=1024) + max_length: Optional[int] = field(default=1024) gradient_checkpointing: Optional[bool] = field(default=True) bf16: Optional[bool] = field(default=True) attn_implementation: Optional[str] = field(default="flash_attention_2") @@ -56,7 +56,7 @@ class ScriptArguments: save_strategy: Optional[str] = field(default="epoch") save_steps: Optional[int] = field(default=1000) debug: Optional[bool] = field(default=False, metadata={'help': 'if debug=True, only train with 100 samples'}) - + parser = HfArgumentParser(ScriptArguments) @@ -67,7 +67,7 @@ class ScriptArguments: else: output_name = 
f"{script_args.log_dir}/{model_name_split}_{script_args.wandb_name}_len{script_args.max_length}_fulltrain_{script_args.learning_rate}_data{script_args.dataset.split('/')[-1]}" -device = Accelerator().local_process_index +device = Accelerator().local_process_index training_args = TrainingArguments( output_dir=os.path.join(output_name, 'logs'), @@ -80,7 +80,7 @@ class ScriptArguments: save_strategy=script_args.save_strategy, save_steps=script_args.save_steps, gradient_accumulation_steps=script_args.gradient_accumulation_steps, - gradient_checkpointing=script_args.gradient_checkpointing, + gradient_checkpointing=script_args.gradient_checkpointing, bf16=script_args.bf16, logging_strategy="steps", logging_steps=10, @@ -117,7 +117,7 @@ class ScriptArguments: model_params = {} model = AutoModelForSequenceClassification.from_pretrained( - script_args.base_model, num_labels=1, device_map=device, + script_args.base_model, num_labels=1, device_map=device, torch_dtype=torch.bfloat16, **model_params ) @@ -125,9 +125,9 @@ class ScriptArguments: if script_args.freeze_pretrained: # for frozon baseline mlp_layer = nn.Sequential( - nn.Linear(model.config.hidden_size, 1024, dtype=torch.bfloat16), + nn.Linear(model.config.hidden_size, 1024, dtype=torch.bfloat16), nn.ReLU(), - nn.Linear(1024, 1, dtype=torch.bfloat16) + nn.Linear(1024, 1, dtype=torch.bfloat16) ) mlp_layer.to(device) # Replace the classifier with the MLP diff --git a/reward_models/utils.py b/reward_models/utils.py index e4c3a7d..c76378a 100644 --- a/reward_models/utils.py +++ b/reward_models/utils.py @@ -34,7 +34,7 @@ def compute_metrics(eval_pred): def grm_compute_metrics(eval_pred): rewards = eval_pred.label_ids reward_accuracy = (rewards[:, 0] > rewards[:, 1]).mean() - + predictions = eval_pred.predictions accuracy = (predictions[:, 0] > predictions[:, 1]).mean() return { diff --git a/rlhf/bon/README.md b/rlhf/bon/README.md index 7f029f7..438609c 100644 --- a/rlhf/bon/README.md +++ b/rlhf/bon/README.md @@ -24,7 +24,7 @@ The trained proxy model is applied to each of the generated responses, assigning ### Step 4: Select Best-of-N Responses -Using the proxy scores from Step 3, we select the single best response out of the $N$ generated responses for each prompt. The highest-scoring response is chosen as the “best-of-N” according to the proxy model. +Using the proxy scores from Step 3, we select the single best response out of the $N$ generated responses for each prompt. The highest-scoring response is chosen as the “best-of-N” according to the proxy model. **Note**: To apply ensemble methods: (1) train multiple models in Step 1 using different random seeds; (2) perform inference with each model in Step 3; and (3) aggregate the proxy scores using methods like `min` or `avg` before proceeding to Step 4. 
diff --git a/rlhf/bon/load_datasets.py b/rlhf/bon/load_datasets.py index 3788226..ace51a1 100644 --- a/rlhf/bon/load_datasets.py +++ b/rlhf/bon/load_datasets.py @@ -9,7 +9,7 @@ def load_train_eval_dataset(data_path, tokenizer, size=None, model_name=''): - train_dataset = build_dataset_UF(data_path, tokenizer, split='train', size=size, model_name=model_name) + train_dataset = build_dataset_UF(data_path, tokenizer, split='train', size=size, model_name=model_name) eval_dataset = build_dataset_UF(data_path, tokenizer, split='test', model_name=model_name) return train_dataset, eval_dataset @@ -17,7 +17,7 @@ def load_train_eval_dataset(data_path, tokenizer, size=None, model_name=''): # for UnifiedFeedback def build_dataset_UF(data_path, tokenizer, split='train', size=None, model_name=''): - + ds = load_dataset(data_path, split=split) # filter data with the same rating ds = ds.filter(lambda example: example['conv_A_rating'] != example['conv_B_rating'], num_proc=30) @@ -35,11 +35,11 @@ def formatting_func(example): chosen_messages = example['conv_B'] rejected_messages = example['conv_A'] margin = example['conv_B_rating'] - example['conv_A_rating'] - + if 'summarize' in example['source']: chosen_messages[0]['content'] = 'Generate one-sentence summary for the following post: ' + chosen_messages[0]['content'].strip() rejected_messages[0]['content'] = 'Generate one-sentence summary for the following post: ' + rejected_messages[0]['content'].strip() - + prompt_plus_chosen_response = tokenizer.apply_chat_template(chosen_messages, tokenize=False) prompt_plus_rejected_response = tokenizer.apply_chat_template(rejected_messages, tokenize=False) tokens_chosen = tokenizer.encode_plus(prompt_plus_chosen_response, **kwargs) @@ -63,9 +63,9 @@ def formatting_func(example): return { "input_ids_chosen": tokens_chosen["input_ids"][0], "attention_mask_chosen": tokens_chosen["attention_mask"][0], "input_ids_rejected": tokens_rejected["input_ids"][0], "attention_mask_rejected": tokens_rejected["attention_mask"][0], - "margin": margin, + "margin": margin, } - + ds = ds.map(formatting_func, batched=False, num_proc=10) # ds = ds.filter(lambda x: len(x["input_ids_chosen"]) <= script_args.max_length and len(x["input_ids_rejected"]) <= script_args.max_length, num_proc=30) @@ -84,18 +84,18 @@ def build_datasets_inference(data_path, tokenizer, split='', size=None, max_leng ds = load_dataset(data_path, split=split) if size is not None: ds = ds.select(range(size)) - + def formatting_func(example): kwargs = {"padding": 'max_length', "truncation": True, "max_length": max_length, "return_tensors": "pt"} - + if not isinstance(example['output'], str): answer = '' else: answer = example['output'] - + messages = [{"role": "user", "content": example['input']}, {"role": "assistant", "content": answer}] - + prompt_plus_response = tokenizer.apply_chat_template(messages, tokenize=False) tokens = tokenizer.encode_plus(prompt_plus_response, **kwargs) diff --git a/rlhf/bon/step1_train_proxy_reward_model_baseline.py b/rlhf/bon/step1_train_proxy_reward_model_baseline.py index eb95e01..1a84793 100644 --- a/rlhf/bon/step1_train_proxy_reward_model_baseline.py +++ b/rlhf/bon/step1_train_proxy_reward_model_baseline.py @@ -26,19 +26,19 @@ @dataclass class ScriptArguments: # training args - per_device_train_batch_size: Optional[int] = field(default=1) + per_device_train_batch_size: Optional[int] = field(default=1) gradient_accumulation_steps: Optional[int] = field(default=16) learning_rate: Optional[float] = field(default=1e-5) num_train_epochs: 
Optional[int] = field(default=2, metadata={"help": "The number of training epochs for the reward model."}) optim: Optional[str] = field(default="adamw_hf", metadata={"help": "The optimizer to use."}) lr_scheduler_type: Optional[str] = field(default="cosine", metadata={"help": "The lr scheduler"},) - max_length: Optional[int] = field(default=1024) + max_length: Optional[int] = field(default=1024) gradient_checkpointing: Optional[bool] = field(default=True) bf16: Optional[bool] = field(default=True) attn_implementation: Optional[str] = field(default="flash_attention_2") # data dataset: Optional[str] = field(default='rlhf/bon/step1_obtain_gold_score/unified_sampled_gold_score') - + # lora use_lora: Optional[bool] = field(default=True) lora_target_modules: Optional[List[str]] = field(default_factory=lambda: ["q_proj", "k_proj", "v_proj", "o_proj"]) @@ -61,7 +61,7 @@ class ScriptArguments: save_strategy: Optional[str] = field(default="epoch") save_steps: Optional[int] = field(default=1000) debug: Optional[bool] = field(default=False, metadata={'help': 'if debug=True, only train with 100 samples'}) - + parser = HfArgumentParser(ScriptArguments) @@ -72,7 +72,7 @@ class ScriptArguments: else: output_name = f"{script_args.log_dir}/{model_name_split}_{script_args.wandb_name}_len{script_args.max_length}_fulltrain_{script_args.learning_rate}_data{script_args.dataset.split('/')[-1]}" -device = Accelerator().local_process_index +device = Accelerator().local_process_index training_args = TrainingArguments( output_dir=os.path.join(output_name, 'logs'), @@ -85,7 +85,7 @@ class ScriptArguments: save_strategy=script_args.save_strategy, save_steps=script_args.save_steps, gradient_accumulation_steps=script_args.gradient_accumulation_steps, - gradient_checkpointing=script_args.gradient_checkpointing, + gradient_checkpointing=script_args.gradient_checkpointing, bf16=script_args.bf16, logging_strategy="steps", logging_steps=10, @@ -122,7 +122,7 @@ class ScriptArguments: model_params = {} model = AutoModelForSequenceClassification.from_pretrained( - script_args.base_model, num_labels=1, device_map=device, + script_args.base_model, num_labels=1, device_map=device, torch_dtype=torch.bfloat16, **model_params ) @@ -130,9 +130,9 @@ class ScriptArguments: if script_args.freeze_pretrained: # for frozon baseline mlp_layer = nn.Sequential( - nn.Linear(model.config.hidden_size, 1024, dtype=torch.bfloat16), + nn.Linear(model.config.hidden_size, 1024, dtype=torch.bfloat16), nn.ReLU(), - nn.Linear(1024, 1, dtype=torch.bfloat16) + nn.Linear(1024, 1, dtype=torch.bfloat16) ) mlp_layer.to(device) # Replace the classifier with the MLP diff --git a/rlhf/bon/step1_train_proxy_reward_model_grm.py b/rlhf/bon/step1_train_proxy_reward_model_grm.py index c8b6e3a..d6f51cd 100644 --- a/rlhf/bon/step1_train_proxy_reward_model_grm.py +++ b/rlhf/bon/step1_train_proxy_reward_model_grm.py @@ -27,13 +27,13 @@ @dataclass class ScriptArguments: # training args - per_device_train_batch_size: Optional[int] = field(default=1) + per_device_train_batch_size: Optional[int] = field(default=1) gradient_accumulation_steps: Optional[int] = field(default=16) learning_rate: Optional[float] = field(default=1e-5) num_train_epochs: Optional[int] = field(default=2, metadata={"help": "The number of training epochs for the reward model."}) optim: Optional[str] = field(default="adamw_hf", metadata={"help": "The optimizer to use."}) lr_scheduler_type: Optional[str] = field(default="cosine", metadata={"help": "The lr scheduler"},) - max_length: Optional[int] = 
field(default=1024) + max_length: Optional[int] = field(default=1024) gradient_checkpointing: Optional[bool] = field(default=True) bf16: Optional[bool] = field(default=True) attn_implementation: Optional[str] = field(default="flash_attention_2") @@ -68,9 +68,9 @@ class ScriptArguments: reference_free: Optional[bool] = field(default=True) sft_only: Optional[bool] = field(default=True) no_logsigmoid_sft: Optional[bool] = field(default=False) - - + + parser = HfArgumentParser(ScriptArguments) @@ -81,7 +81,7 @@ class ScriptArguments: else: output_name = f"{script_args.log_dir}/{model_name_split}_{script_args.wandb_name}_len{script_args.max_length}_fulltrain_{script_args.learning_rate}_data{script_args.dataset.split('/')[-1]}" -device = Accelerator().local_process_index +device = Accelerator().local_process_index training_args = TrainingArguments( output_dir=os.path.join(output_name, 'logs'), @@ -94,7 +94,7 @@ class ScriptArguments: save_strategy=script_args.save_strategy, save_steps=script_args.save_steps, gradient_accumulation_steps=script_args.gradient_accumulation_steps, - gradient_checkpointing=script_args.gradient_checkpointing, + gradient_checkpointing=script_args.gradient_checkpointing, bf16=script_args.bf16, logging_strategy="steps", logging_steps=10, @@ -142,7 +142,7 @@ class ScriptArguments: model = AutoModelForCausalLMWithValueHead.from_pretrained( - script_args.base_model, device_map=device, + script_args.base_model, device_map=device, torch_dtype=torch.bfloat16, **model_params, ) diff --git a/rlhf/bon/step2_generate_samples.py b/rlhf/bon/step2_generate_samples.py index 4f21a18..0c6f858 100644 --- a/rlhf/bon/step2_generate_samples.py +++ b/rlhf/bon/step2_generate_samples.py @@ -50,7 +50,7 @@ def generate_samples(): # Initialize Accelerator accelerator = Accelerator() - device = Accelerator().local_process_index + device = Accelerator().local_process_index # Create output directory output_dir = create_output_directory(script_args.save_path, script_args.save_name) @@ -66,7 +66,7 @@ def generate_samples(): # Load and process dataset dataset = load_data2generate(script_args.data_path, tokenizer, script_args.N, script_args.debug) print('Size of Total Dataset: %s'%(len(dataset))) - + # Prepare dataset with accelerator total_size = len(dataset) chunk_size = ceil(total_size / accelerator.num_processes) @@ -90,12 +90,12 @@ def generate_samples(): **prompts, max_new_tokens=script_args.max_new_tokens, pad_token_id=tokenizer.eos_token_id, - do_sample=True, + do_sample=True, top_k=0.0, temperature=0.7, top_p=0.95 ) - + # Remove prompt from generated tokens outputs = [tok_out[len(tok_in):] for tok_in, tok_out in zip(prompts["input_ids"], outputs)] decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) diff --git a/rlhf/bon/step3_obtain_proxy_score.py b/rlhf/bon/step3_obtain_proxy_score.py index c036ac6..26aab3a 100644 --- a/rlhf/bon/step3_obtain_proxy_score.py +++ b/rlhf/bon/step3_obtain_proxy_score.py @@ -74,7 +74,7 @@ def obtain_proxy_score(): if script_args.debug: dataset = dataset.select(range(0,40)) print('Size of Dataset: %s'%(len(dataset))) - + sampler = DistributedSampler(dataset, num_replicas=accelerator.num_processes, rank=accelerator.local_process_index, shuffle=False) data_loader = prepare_data_loader(dataset, tokenizer, script_args.per_device_batch_size, sampler=sampler, collate_fn_type='custom') # data_loader = accelerator.prepare(data_loader) @@ -96,11 +96,11 @@ def obtain_proxy_score(): full_prompts, full_rewards, full_source_ids, full_id_ids = [], [], [], [] 
pbar = tqdm(total=len(data_loader) * script_args.per_device_batch_size // accelerator.num_processes) device = accelerator.local_process_index - + with torch.no_grad(): for batch in data_loader: if script_args.model_type == 'grm': - reward_tensors = model_withhead_forward(model, batch["input_ids"], batch["attention_mask"], device, forward_type='reward') + reward_tensors = model_withhead_forward(model, batch["input_ids"], batch["attention_mask"], device, forward_type='reward') else: reward_tensors = model(batch["input_ids"].to(device), attention_mask=batch["attention_mask"].to(device)).logits.reshape(-1) @@ -109,13 +109,13 @@ def obtain_proxy_score(): full_source_ids.extend(batch['source']) full_id_ids.extend(batch['id']) pbar.update(1) - + full_prompts = [x.rstrip(tokenizer.pad_token) for x in tokenizer.batch_decode(full_prompts)] full_rewards = [float(x) for x in full_rewards] # full_source_ids = full_source_ids # full_id_ids = full_id_ids - + accelerator.wait_for_everyone() # Gather results from all processes all_prompts = accelerator.gather_for_metrics(full_prompts) diff --git a/rlhf/bon/step4_choose_best_of_n.py b/rlhf/bon/step4_choose_best_of_n.py index b77970a..6b6deac 100644 --- a/rlhf/bon/step4_choose_best_of_n.py +++ b/rlhf/bon/step4_choose_best_of_n.py @@ -54,19 +54,19 @@ def processing(method, filtered_results, data_df, output_dir): grouped_max_scores = data_df.groupby('id_ids').apply(get_highest_within_n, n) grouped_max_scores['N'] = n all_grouped_scores.append(grouped_max_scores) - + # Concatenate all DataFrames final_df = pd.concat(all_grouped_scores).reset_index(drop=True) # Apply processing to 'prompts' field new_columns = final_df['prompts'].apply(process_row) final_df = pd.concat([final_df, new_columns], axis=1) final_df.rename(columns={'id_ids': 'id', 'source_ids': 'source'}, inplace=True) - + # Save results to CSV save_path = os.path.join(output_dir, f'bon_selected_proxy_{method}.csv') final_df.to_csv(save_path, index=False) print(f"Saved results to {save_path}") - + # Save deduplicated results dedup_df = final_df.drop_duplicates(subset=['id', 'order']) dedup_save_path = os.path.join(output_dir, f'bon_selected_proxy_{method}_drop_duplicates') diff --git a/rlhf/bon/step5_obtain_bon_gold_score.py b/rlhf/bon/step5_obtain_bon_gold_score.py index ba809ba..5d60661 100644 --- a/rlhf/bon/step5_obtain_bon_gold_score.py +++ b/rlhf/bon/step5_obtain_bon_gold_score.py @@ -47,7 +47,7 @@ def evaluate_and_collect_results(model, data_loader, tokenizer, accelerator, bat """Evaluate and return results.""" full_prompts, full_rewards, full_source_ids, full_id_ids, full_order_ids = [], [], [], [], [] pbar = tqdm(total=len(data_loader) * batch_size // accelerator.num_processes) - + with torch.no_grad(): for i, batch in enumerate(data_loader): reward_tensors = model(batch["input_ids"].to(model.device), attention_mask=batch["attention_mask"].to(model.device)).logits.reshape(-1) @@ -109,7 +109,7 @@ def obtain_bon_gold_score(): # Run evaluation and gather results evaluation_result = evaluate_and_collect_results(model, data_loader, tokenizer, accelerator, script_args.per_device_batch_size) - + # Save results to CSV if accelerator.is_main_process: df = pd.DataFrame(evaluation_result) diff --git a/rlhf/bon/step6_collect.py b/rlhf/bon/step6_collect.py index 0b41d58..0f2dfb4 100644 --- a/rlhf/bon/step6_collect.py +++ b/rlhf/bon/step6_collect.py @@ -13,7 +13,7 @@ class ScriptArguments: proxy_score_path: str = field(default='./step4_choose_best_of_n/gemma-2b-it/grm/proxy_score.csv', metadata={'help': 
'Path to the proxy score CSV'}) gold_score_path: str = field(default='./step5_obtain_bon_gold_score/gemma-2b-it/grm/gold_score.csv', metadata={'help': 'Path to the gold score CSV'}) output_path: str = field(default='./step6_collect/gemma-2b-it/grm', metadata={'help': 'Path to save the output CSV'}) - + n_values_start: Optional[int] = field(default=1, metadata={"help": "Starting value of N range to consider."}) n_values_end: Optional[int] = field(default=406, metadata={"help": "Ending value of N range to consider."}) kl_min: Optional[float] = field(default=0.0, metadata={"help": "Minimum KL value for filtering."}) @@ -24,7 +24,7 @@ def parse_args() -> ScriptArguments: parser = argparse.ArgumentParser(description="Process KL values and calculate average gold scores.") for field_name, field_def in ScriptArguments.__dataclass_fields__.items(): parser.add_argument(f"--{field_name}", type=type(field_def.default), default=field_def.default, help=field_def.metadata['help']) - + args = parser.parse_args() return ScriptArguments(**vars(args)) @@ -39,17 +39,17 @@ def process_gold_scores(filtered_results, df_gold_score, df_proxy_score): print('Processing for N =', n) # Filter df_bon by specific N df_n = df_proxy_score[df_proxy_score['N'] == n] - + # Match and collect gold scores matched_gold_scores = [] for _, bon_row in df_n.iterrows(): id_value, order_value = bon_row['id'], bon_row['order'] - gold_score = df_gold_score[(df_gold_score['id_ids'] == id_value) & + gold_score = df_gold_score[(df_gold_score['id_ids'] == id_value) & (df_gold_score['order_ids'] == order_value)]['gold_rewards'].values matched_gold_scores.append(gold_score[0]) # Calculate and store average score - avg_gold_score = np.mean(matched_gold_scores) + avg_gold_score = np.mean(matched_gold_scores) best_of_n_score_list.append(avg_gold_score) filtered_results['avg_gold_score'] = best_of_n_score_list @@ -64,11 +64,11 @@ def processing_proxy_scores(filtered_results, df_proxy_score): print('Processing for N =', n) # Filter df_bon by specific N df_n = df_proxy_score[df_proxy_score['N'] == n] - + proxy_scores_list = [] # Iterate over each row in df_n for _, row in df_n.iterrows(): - proxy_score = row['rewards'] + proxy_score = row['rewards'] # Collect all matching gold_score records proxy_scores_list.append(proxy_score) @@ -78,7 +78,7 @@ def processing_proxy_scores(filtered_results, df_proxy_score): filtered_results['avg_proxy_score'] = best_of_n_score_list return filtered_results - + def collect(): # Parse arguments diff --git a/rlhf/bon/utils.py b/rlhf/bon/utils.py index 4d2bc43..5a90395 100644 --- a/rlhf/bon/utils.py +++ b/rlhf/bon/utils.py @@ -39,7 +39,7 @@ def compute_metrics(eval_pred): def grm_compute_metrics(eval_pred): rewards = eval_pred.label_ids reward_accuracy = (rewards[:, 0] > rewards[:, 1]).mean() - + predictions = eval_pred.predictions accuracy = (predictions[:, 0] > predictions[:, 1]).mean() return { @@ -81,7 +81,7 @@ def create_output_directory(log_dir: str, wandb_name: str): def save_results_in_parquet_splits(results, num_splits, save_path, mode='test'): results_df = pd.DataFrame(results) dataset_with_results = Dataset.from_pandas(results_df) - + split_size = len(dataset_with_results) // num_splits for i in range(num_splits): start = i * split_size diff --git a/rlhf/data_generation/load_datasets.py b/rlhf/data_generation/load_datasets.py index 2f43046..cae78a0 100644 --- a/rlhf/data_generation/load_datasets.py +++ b/rlhf/data_generation/load_datasets.py @@ -44,14 +44,14 @@ def build_dataset_UF4gold_score(data_path, 
tokenizer, split='', size=None, max_l def formatting_func(example): kwargs = {"padding": 'max_length', "truncation": True, "max_length": max_length, "return_tensors": "pt"} example['source_id'] = source_dict[example['source']] - + chosen_messages = example['conv_A'] rejected_messages = example['conv_B'] if 'summarize' in example['source']: chosen_messages[0]['content'] = 'Generate one-sentence summary for the following post: ' + chosen_messages[0]['content'].strip() rejected_messages[0]['content'] = 'Generate one-sentence summary for the following post: ' + rejected_messages[0]['content'].strip() - + prompt_plus_chosen_response = tokenizer.apply_chat_template(chosen_messages, tokenize=False) prompt_plus_rejected_response = tokenizer.apply_chat_template(rejected_messages, tokenize=False) tokens_chosen = tokenizer.encode_plus(prompt_plus_chosen_response, **kwargs) diff --git a/rlhf/data_generation/obtain_gold_score.py b/rlhf/data_generation/obtain_gold_score.py index cac64a5..c6794bb 100644 --- a/rlhf/data_generation/obtain_gold_score.py +++ b/rlhf/data_generation/obtain_gold_score.py @@ -29,7 +29,7 @@ class ScriptArguments: mode: Optional[str] = field(default="train", metadata={"help": "'train', and 'test'"}) num_splits: int = field(default=1, metadata={"help": "Number of splits for saving results"}) debug: Optional[bool] = field(default=False) - + def parse_args() -> ScriptArguments: parser = argparse.ArgumentParser(description="Set parameters for model training & evaluation.") @@ -49,7 +49,7 @@ def obtain_gold_score(script_args): # Initialize Accelerator accelerator = Accelerator() - device = Accelerator().local_process_index + device = Accelerator().local_process_index print('Curent Device', device) print('Number of processes:', accelerator.num_processes) @@ -63,14 +63,14 @@ def obtain_gold_score(script_args): model = AutoModelForSequenceClassification.from_pretrained(script_args.model_path, num_labels=1, device_map=device, torch_dtype=torch.bfloat16) # model.resize_token_embeddings(len(tokenizer)) # model.config.pad_token_id = tokenizer.pad_token_id - + # Prepare dataset and DataLoader dataset = build_dataset_UF4gold_score(script_args.data_path, tokenizer, split=script_args.mode, max_length=script_args.max_length) - + if script_args.debug: dataset = dataset.select(range(0,100)) print('Size of %s Dataset: %s'%(script_args.mode, len(dataset))) - + # Shard the dataset among processes sampler = DistributedSampler(dataset, num_replicas=accelerator.num_processes, rank=accelerator.local_process_index, shuffle=False) data_loader = prepare_data_loader(dataset, tokenizer, script_args.per_device_batch_size, sampler=sampler) @@ -97,7 +97,7 @@ def obtain_gold_score(script_args): full_unique_ids.extend(batch['unique_id']) if accelerator.is_main_process: pbar.update(1) - + full_chosen_prompts = [x.rstrip(tokenizer.pad_token) for x in tokenizer.batch_decode(full_chosen_prompts)] full_rejected_prompts = [x.rstrip(tokenizer.pad_token) for x in tokenizer.batch_decode(full_rejected_prompts)] @@ -108,14 +108,14 @@ def obtain_gold_score(script_args): # print(f'Process {accelerator.local_process_index} processed {len(full_chosen_prompts)} prompts') accelerator.wait_for_everyone() - + all_chosen_prompts = accelerator.gather_for_metrics(full_chosen_prompts) all_rejected_prompts = accelerator.gather_for_metrics(full_rejected_prompts) all_rewards_chosen = accelerator.gather_for_metrics(full_rewards_chosen) all_rewards_rejected = accelerator.gather_for_metrics(full_rewards_rejected) if 'unique_id' in batch.keys(): 
all_unique_ids = accelerator.gather_for_metrics(full_unique_ids) - + if accelerator.is_main_process: evaluation_result = { 'prompts_A': all_chosen_prompts, @@ -139,7 +139,7 @@ def replace_with_gold_reward(example): example['conv_A_rating'] = matching_row.iloc[0]['rewards_A'] example['conv_B_rating'] = matching_row.iloc[0]['rewards_B'] return example - + # Apply the replacement function to the dataset tokenizer = AutoTokenizer.from_pretrained(script_args.model_path, use_fast=False) dataset_prepared = load_dataset_within_maxlength(script_args.data_path, tokenizer, split=script_args.mode, max_length=script_args.max_length) @@ -151,7 +151,7 @@ def replace_with_gold_reward(example): dataset_gold_score = dataset_gold_score.remove_columns(['unique_id']) save_results_in_parquet_splits(dataset_gold_score, num_splits=script_args.num_splits, save_path=output_dir, mode=script_args.mode) - + if __name__ == "__main__": script_args = parse_args() diff --git a/rlhf/data_generation/sample_dataset.py b/rlhf/data_generation/sample_dataset.py index f57dc6e..bb55d5d 100644 --- a/rlhf/data_generation/sample_dataset.py +++ b/rlhf/data_generation/sample_dataset.py @@ -21,7 +21,7 @@ class ScriptArguments: save_name: Optional[str] = field(default='unified_sampled', metadata={"help": "Dataset Name."}) num_splits: int = field(default=1, metadata={"help": "Number of splits for saving results"}) debug: Optional[bool] = field(default=False) - + def parse_args() -> ScriptArguments: parser = argparse.ArgumentParser(description="Set parameters for model training & evaluation.") @@ -35,7 +35,7 @@ def parse_args() -> ScriptArguments: args = parser.parse_args() return ScriptArguments(**vars(args)) - + script_args = parse_args() # Load the dataset diff --git a/rlhf/data_generation/utils.py b/rlhf/data_generation/utils.py index 9e9624d..2336ed4 100644 --- a/rlhf/data_generation/utils.py +++ b/rlhf/data_generation/utils.py @@ -20,7 +20,7 @@ def create_output_directory(log_dir: str, wandb_name: str): def save_results_in_parquet_splits(results, num_splits, save_path, mode='test'): results_df = pd.DataFrame(results) dataset_with_results = Dataset.from_pandas(results_df) - + split_size = len(dataset_with_results) // num_splits for i in range(num_splits): start = i * split_size diff --git a/rlhf/ppo/config.py b/rlhf/ppo/config.py index cd23658..54a088a 100644 --- a/rlhf/ppo/config.py +++ b/rlhf/ppo/config.py @@ -1,11 +1,11 @@ -### default shared configs +### default shared configs from peft import LoraConfig def get_config(tokenizer): lora_config = LoraConfig( - r=32, - lora_alpha=64, + r=32, + lora_alpha=64, lora_dropout=0.05, target_modules=["q_proj","k_proj","v_proj","o_proj"], bias="none", @@ -14,9 +14,9 @@ def get_config(tokenizer): generation_kwargs = { "max_new_tokens": 512, - 'min_length': -1, + 'min_length': -1, "top_k": 0.0, - "top_p": 0.9, + "top_p": 0.9, "do_sample": True, "temperature": 0.7, "pad_token_id": tokenizer.eos_token_id, @@ -25,7 +25,7 @@ def get_config(tokenizer): eval_generation_kwargs = { "max_new_tokens": 512, - 'min_length': -1, + 'min_length': -1, "do_sample": False, "pad_token_id": tokenizer.eos_token_id, "begin_suppress_tokens": [tokenizer.eos_token_id], diff --git a/rlhf/ppo/model_utils.py b/rlhf/ppo/model_utils.py index 43bedf6..4e2e2da 100644 --- a/rlhf/ppo/model_utils.py +++ b/rlhf/ppo/model_utils.py @@ -90,7 +90,7 @@ def __init__(self, config, **kwargs): for i in range(num_layers): module_lis.extend([nn.Linear(input_neurons, num_neurons), nn.ReLU()]) input_neurons = num_neurons - + 
module_lis.append(nn.Linear(num_neurons, 1)) self.summary = nn.Sequential(*module_lis) self.flatten = nn.Flatten() @@ -231,7 +231,7 @@ def forward( last_hidden_state = last_hidden_state.to(self.v_head.summary.weight.device) elif not hasattr(self.v_head.summary, 'weight') and (last_hidden_state.device != self.v_head.summary[0].weight.device): last_hidden_state = last_hidden_state.to(self.v_head.summary[0].weight.device) - + # use the last token value as reward last_index = attention_mask.sum(dim=-1) - 1 value = self.v_head(last_hidden_state).squeeze(-1)[torch.arange(len(last_hidden_state)), last_index] @@ -261,7 +261,7 @@ def state_dict(self, *args, **kwargs): Returns the state dictionary of the model. We add the state dictionary of the value head to the state dictionary of the wrapped model by prepending the key with `v_head.`. """ - ### return lora + ### return lora pretrained_model_state_dict = self.pretrained_model.state_dict(*args, **kwargs).copy() v_head_state_dict = self.v_head.state_dict(*args, **kwargs).copy() @@ -276,7 +276,7 @@ def push_to_hub(self, *args, **kwargs): return self.pretrained_model.push_to_hub(*args, **kwargs) - + def post_init(self, state_dict): r""" @@ -334,7 +334,7 @@ def load_model_withhead(model_name, peft_name, tokenizer, device, \ if 'Mistral' not in model_name: model_config['attn_implementation'] = "flash_attention_2" - + model = AutoModelForCausalLMWithValueHead.from_pretrained(model_name, **model_config) model.config.pad_token_id = tokenizer.pad_token_id @@ -359,7 +359,7 @@ def load_model_withhead(model_name, peft_name, tokenizer, device, \ missing, unexpected = model.base_model.model.pretrained_model.load_state_dict(loaded_state_dict, strict=False) missing, unexpected = model.base_model.model.load_state_dict(loaded_state_dict, strict=False) - + if hasattr(model, 'merge_and_unload'): model = model.merge_and_unload() return model diff --git a/rlhf/ppo/ppo.py b/rlhf/ppo/ppo.py index 4556946..959a39b 100644 --- a/rlhf/ppo/ppo.py +++ b/rlhf/ppo/ppo.py @@ -7,7 +7,7 @@ from transformers import HfArgumentParser, AutoTokenizer from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, set_seed import numpy as np -import pandas as pd +import pandas as pd tqdm.pandas() from ppo_utils import print_trainable_parameters, collator, eval_model, build_dataset_unified, transfer_template_rm, plot_curve from rm_utils import load_reward_model @@ -42,16 +42,16 @@ class ScriptArguments: parser = HfArgumentParser(ScriptArguments) script_args = parser.parse_args_into_dataclasses()[0] -# Remember to use a merged sft model if using lora +# Remember to use a merged sft model if using lora base_model_name = script_args.base_model_name tokenier_name = script_args.base_model_name print('base model: ', base_model_name) if script_args.disable_wandb: # if you don't need the wandb log - os.environ['WANDB_DISABLED'] = 'true' + os.environ['WANDB_DISABLED'] = 'true' accelerator = Accelerator() -gpu_id= Accelerator().local_process_index +gpu_id= Accelerator().local_process_index set_seed(8888) print('process: {}'.format(gpu_id)) if accelerator.is_main_process and not os.path.exists(os.path.join(script_args.log_dir, script_args.wandb_name)): @@ -85,7 +85,7 @@ class ScriptArguments: eval_dataset = eval_dataset.select(range(40)) print(f"Size of the train set: {len(train_dataset)}, eval set: {len(eval_dataset)}") -# load fixed configs +# load fixed configs lora_config, generation_kwargs, eval_generation_kwargs = get_config(tokenizer) model_params = { "torch_dtype": torch.bfloat16, 
@@ -133,12 +133,12 @@ class ScriptArguments:
         query_tensors = batch["input_ids"]
 
         with torch.no_grad():
-            response_tensors = ppo_trainer.generate(query_tensors, return_prompt=False, **generation_kwargs) 
+            response_tensors = ppo_trainer.generate(query_tensors, return_prompt=False, **generation_kwargs)
             full_responses = tokenizer.batch_decode(response_tensors)
             lengths = [len(x) for x in full_responses]
         batch['response'] = full_responses
-        
+
         # Compute score
         kwargs = {"padding": 'max_length', "truncation": True, "max_length": script_args.max_length, "return_tensors": "pt"}
         if tokenizer.chat_template == rm_tokenizer.chat_template:
@@ -147,11 +147,11 @@ class ScriptArguments:
             # changing template for different reward model and base model
             temp_lis = [(transfer_template_rm(query, response, tokenizer, rm_tokenizer)) for query, response in zip(batch['query'], batch['response'])]
             encoded_prompt_response = [rm_tokenizer.encode_plus(query + response, **kwargs) for query, response in temp_lis]
-        
+
         with torch.no_grad():
-            reward_tensors = [reward_model(x['input_ids'].to(rm_gpu_id)).logits[0] for x in encoded_prompt_response] 
+            reward_tensors = [reward_model(x['input_ids'].to(rm_gpu_id)).logits[0] for x in encoded_prompt_response]
             rewards = [r.item() for r in reward_tensors]
-        
+
         # normalize using the first batch statistics
         if script_args.normalize_rewards:
             if epoch == 0 and i == 0:
@@ -208,4 +208,3 @@ class ScriptArguments:
             save_path = os.path.join(script_args.log_dir, script_args.wandb_name, name)
             ppo_trainer.save_pretrained(save_path)
             print("iter {}, batch {}: model saved".format(epoch, i))
-    
\ No newline at end of file
diff --git a/rlhf/ppo/ppo_grm.py b/rlhf/ppo/ppo_grm.py
index c6486e7..1a693dd 100644
--- a/rlhf/ppo/ppo_grm.py
+++ b/rlhf/ppo/ppo_grm.py
@@ -7,7 +7,7 @@ from transformers import HfArgumentParser, AutoTokenizer
 from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, set_seed
 import numpy as np
-import pandas as pd 
+import pandas as pd
 tqdm.pandas()
 from ppo_utils import print_trainable_parameters, collator, eval_model, build_dataset_unified, transfer_template_rm, plot_curve
 from rm_utils import load_reward_model
@@ -45,16 +45,16 @@ class ScriptArguments:
 parser = HfArgumentParser(ScriptArguments)
 script_args = parser.parse_args_into_dataclasses()[0]
 
-# Remember to use a merged sft model if using lora 
+# Remember to use a merged sft model if using lora
 base_model_name = script_args.base_model_name
 tokenier_name = script_args.base_model_name
 print('base model: ', base_model_name)
 
 if script_args.disable_wandb: # if you don't need the wandb log
-    os.environ['WANDB_DISABLED'] = 'true' 
+    os.environ['WANDB_DISABLED'] = 'true'
 accelerator = Accelerator()
-gpu_id= Accelerator().local_process_index 
+gpu_id= Accelerator().local_process_index
 set_seed(8888)
 print('process: {}'.format(gpu_id))
 if accelerator.is_main_process and not os.path.exists(os.path.join(script_args.log_dir, script_args.wandb_name)):
@@ -88,7 +88,7 @@ class ScriptArguments:
     eval_dataset = eval_dataset.select(range(40))
 print(f"Size of the train set: {len(train_dataset)}, eval set: {len(eval_dataset)}")
 
-# load fixed configs 
+# load fixed configs
 lora_config, generation_kwargs, eval_generation_kwargs = get_config(tokenizer)
 model_params = {
     "torch_dtype": torch.bfloat16,
@@ -136,12 +136,12 @@ class ScriptArguments:
         query_tensors = batch["input_ids"]
 
         with torch.no_grad():
-            response_tensors = ppo_trainer.generate(query_tensors, return_prompt=False, **generation_kwargs) 
+            response_tensors = ppo_trainer.generate(query_tensors, return_prompt=False, **generation_kwargs)
             full_responses = tokenizer.batch_decode(response_tensors)
             lengths = [len(x) for x in full_responses]
         batch['response'] = full_responses
-        
+
         # Compute score
         kwargs = {"padding": 'max_length', "truncation": True, "max_length": script_args.max_length, "return_tensors": "pt"}
         if tokenizer.chat_template == rm_tokenizer.chat_template:
@@ -150,11 +150,11 @@ class ScriptArguments:
             # changing template for different reward model and base model
             temp_lis = [(transfer_template_rm(query, response, tokenizer, rm_tokenizer)) for query, response in zip(batch['query'], batch['response'])]
             encoded_prompt_response = [rm_tokenizer.encode_plus(query + response, **kwargs) for query, response in temp_lis]
-        
+
         with torch.no_grad():
-            reward_tensors = [model_withhead_forward(reward_model, x['input_ids'], x["attention_mask"], device=rm_gpu_id) for x in encoded_prompt_response] 
+            reward_tensors = [model_withhead_forward(reward_model, x['input_ids'], x["attention_mask"], device=rm_gpu_id) for x in encoded_prompt_response]
             rewards = [r.item() for r in reward_tensors]
-        
+
         # normalize using the first batch statistics
         if script_args.normalize_rewards:
             if epoch == 0 and i == 0:
@@ -211,4 +211,3 @@ class ScriptArguments:
             save_path = os.path.join(script_args.log_dir, script_args.wandb_name, name)
             ppo_trainer.save_pretrained(save_path)
             print("iter {}, batch {}: model saved".format(epoch, i))
-    
\ No newline at end of file
diff --git a/rlhf/ppo/ppo_rm_ensemble.py b/rlhf/ppo/ppo_rm_ensemble.py
index 68d1804..fe1487a 100644
--- a/rlhf/ppo/ppo_rm_ensemble.py
+++ b/rlhf/ppo/ppo_rm_ensemble.py
@@ -7,7 +7,7 @@ from transformers import HfArgumentParser, AutoTokenizer
 from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, set_seed
 import numpy as np
-import pandas as pd 
+import pandas as pd
 tqdm.pandas()
 from ppo_utils import print_trainable_parameters, collator, eval_model, build_dataset_unified, transfer_template_rm, plot_curve
 from rm_utils import load_reward_model, RMEnsemble
@@ -43,15 +43,15 @@ class ScriptArguments:
 parser = HfArgumentParser(ScriptArguments)
 script_args = parser.parse_args_into_dataclasses()[0]
 
-# Remember to use a merged sft model if using lora 
+# Remember to use a merged sft model if using lora
 base_model_name = script_args.base_model_name
 print('base model: ', base_model_name)
 
 if script_args.disable_wandb: # if you don't need the wandb log
-    os.environ['WANDB_DISABLED'] = 'true' 
+    os.environ['WANDB_DISABLED'] = 'true'
 accelerator = Accelerator()
-gpu_id= Accelerator().local_process_index 
+gpu_id= Accelerator().local_process_index
 set_seed(8888)
 print('process: {}'.format(gpu_id))
 if accelerator.is_main_process and not os.path.exists(os.path.join(script_args.log_dir, script_args.wandb_name)):
@@ -87,7 +87,7 @@ class ScriptArguments:
     eval_dataset = eval_dataset.select(range(40))
 print(f"Size of the train set: {len(train_dataset)}, eval set: {len(eval_dataset)}")
 
-# load fixed configs 
+# load fixed configs
 lora_config, generation_kwargs, eval_generation_kwargs = get_config(tokenizer)
 model_params = {
     "torch_dtype": torch.bfloat16,
@@ -135,12 +135,12 @@ class ScriptArguments:
         query_tensors = batch["input_ids"]
 
         with torch.no_grad():
-            response_tensors = ppo_trainer.generate(query_tensors, return_prompt=False, **generation_kwargs) 
+            response_tensors = ppo_trainer.generate(query_tensors, return_prompt=False, **generation_kwargs)
             full_responses = tokenizer.batch_decode(response_tensors)
             lengths = [len(x) for x in full_responses]
         batch['response'] = full_responses
-        
+
         # Compute score
         kwargs = {"padding": 'max_length', "truncation": True, "max_length": script_args.max_length, "return_tensors": "pt"}
         if tokenizer.chat_template == reward_models.rm_tokenizers[0].chat_template:
@@ -149,11 +149,11 @@ class ScriptArguments:
             # changing template for different reward model and base model
             temp_lis = [(transfer_template_rm(query, response, tokenizer, reward_models.rm_tokenizers[0])) for query, response in zip(batch['query'], batch['response'])]
             encoded_prompt_response = [reward_models.rm_tokenizers[0].encode_plus(query + response, **kwargs) for query, response in temp_lis]
-        
+
         with torch.no_grad():
             reward_tensors = reward_models.forward(encoded_prompt_response)
             rewards = [r.item() for r in reward_tensors]
-        
+
         # normalize using the first batch statistics
         if script_args.normalize_rewards:
             if epoch == 0 and i == 0:
@@ -210,4 +210,3 @@ class ScriptArguments:
             save_path = os.path.join(script_args.log_dir, script_args.wandb_name, name)
             ppo_trainer.save_pretrained(save_path)
             print("iter {}, batch {}: model saved".format(epoch, i))
-    
\ No newline at end of file
diff --git a/rlhf/ppo/ppo_utils.py b/rlhf/ppo/ppo_utils.py
index 28c9454..b46848d 100644
--- a/rlhf/ppo/ppo_utils.py
+++ b/rlhf/ppo/ppo_utils.py
@@ -4,7 +4,7 @@ from tqdm import tqdm
 import datasets
 import numpy as np
-import pandas as pd 
+import pandas as pd
 tqdm.pandas()
 import matplotlib.pyplot as plt
 
@@ -39,7 +39,7 @@ def eval_model(ppo_trainer, eval_dataset, tokenizer, accelerator, script_args, n
     full_response_tensors = []
     kl1_list, kl2_list, kl3_list = [], [], []
     full_source_ids, full_id_ids = [], []
-    
+
     eval_data_loader = DataLoader(eval_dataset, batch_size=script_args.eval_batch_size, drop_last=False, collate_fn=collator)
     eval_data_loader = accelerator.prepare(eval_data_loader)
 
@@ -47,7 +47,7 @@ def eval_model(ppo_trainer, eval_dataset, tokenizer, accelerator, script_args, n
     with torch.no_grad():
         for i, batch in enumerate(eval_data_loader):
             query_tensors = batch['input_ids']
-            response_tensors = ppo_trainer.generate(query_tensors, return_prompt=False, **eval_generation_kwargs) 
+            response_tensors = ppo_trainer.generate(query_tensors, return_prompt=False, **eval_generation_kwargs)
 
             full_response_tensors.extend(response_tensors)
             full_prompts.extend(batch['input_ids'])
@@ -135,7 +135,7 @@ def transfer_template_rm(prompt, response, tokenizer, rm_tokenizer):
                  {'content': reply, 'role': 'assistant'}]
             )
         else:
-            query = res[0] 
+            query = res[0]
             query = query.replace("\n", '')
             messages.append(
                 {'content': query, 'role': 'user'},
diff --git a/rlhf/ppo/rm_utils.py b/rlhf/ppo/rm_utils.py
index 565a550..99514fa 100644
--- a/rlhf/ppo/rm_utils.py
+++ b/rlhf/ppo/rm_utils.py
@@ -9,7 +9,7 @@ from transformers import HfArgumentParser, AutoModelForSequenceClassification, AutoTokenizer
 from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, set_seed
 import numpy as np
-import pandas as pd 
+import pandas as pd
 tqdm.pandas()
 from peft import LoraConfig, PeftModel
 import matplotlib.pyplot as plt
 
@@ -77,12 +77,12 @@ def load_reward_models(self, script_args, gpu_id):
             self.gpu_ids.append(rm_gpu_id)
             self.rm_tokenizers.append(rm_tokenizer)
-    
+
     def forward(self, encoded_prompt_response):
         results = []
         with torch.no_grad():
             for i in range(len(self.peft_path_list)):
-                reward_tensors = [self.reward_models[i](x['input_ids'].to(self.gpu_ids[i])).logits[0] for x in encoded_prompt_response] 
+                reward_tensors = [self.reward_models[i](x['input_ids'].to(self.gpu_ids[i])).logits[0] for x in encoded_prompt_response]
                 results.append(torch.concat(reward_tensors).view(-1, 1))
 
         if self.ensemble_method == 'avg':
@@ -92,4 +92,3 @@ def forward(self, encoded_prompt_response):
         else:
             raise NotImplementedError
         return reward_tensors
-        
\ No newline at end of file
diff --git a/rm_eval/eval.py b/rm_eval/eval.py
index f7b2c8e..b846bd6 100644
--- a/rm_eval/eval.py
+++ b/rm_eval/eval.py
@@ -22,7 +22,7 @@
 @dataclass
 class ScriptArguments:
     per_device_eval_batch_size: Optional[int] = field(default=8)
-    max_length: Optional[int] = field(default=1024) 
+    max_length: Optional[int] = field(default=1024)
     base_model: Optional[str] = field(default="google/gemma-2b-it")
     peft_name: Optional[str] = field(default="gemma-2b-it_reward_unified_0.5datasset_bs1_lora32_len1024_1epoch_1e-05/checkpoint")
     log_dir: Optional[str] = field(default='./eval_unified_reward_models')
@@ -34,7 +34,7 @@ class ScriptArguments:
 script_args = parser.parse_args_into_dataclasses()[0]
 
 accelerator = Accelerator()
-device = Accelerator().local_process_index 
+device = Accelerator().local_process_index
 model_name = script_args.base_model
 log_path = os.path.join(script_args.log_dir, model_name.split('/')[-1], script_args.task)
 
@@ -57,7 +57,7 @@ class ScriptArguments:
 ###### load model
 model = AutoModelForSequenceClassification.from_pretrained(
     model_name,
-    num_labels=1, device_map=device, 
+    num_labels=1, device_map=device,
     torch_dtype=torch.float16,
 )
 
@@ -66,7 +66,7 @@ class ScriptArguments:
     print('loading freeze nonlinear parameters')
     tensors = {}
     path_list = glob.glob(os.path.join(script_args.peft_name, "model-*.safetensors"))
-    
+
     for path in path_list:
         with safe_open(path, framework="pt", device=0) as f:
             for k in f.keys():
@@ -75,15 +75,15 @@ class ScriptArguments:
 
     # use the same structure as the training
     mlp_layer = nn.Sequential(
-        nn.Linear(model.config.hidden_size, 1024, dtype=torch.float16), 
+        nn.Linear(model.config.hidden_size, 1024, dtype=torch.float16),
         nn.ReLU(),
-        nn.Linear(1024, 1, dtype=torch.float16) 
+        nn.Linear(1024, 1, dtype=torch.float16)
     )
     mlp_layer.to(device)
 
     # Replace the classifier with the MLP
     model.score = mlp_layer
     model.load_state_dict(tensors, strict=False)
-    
+
 model.resize_token_embeddings(len(tokenizer))
 model.config.pad_token_id = tokenizer.pad_token_id
@@ -148,6 +148,6 @@ class ScriptArguments:
     dataframe.to_csv(os.path.join(log_path,'eval_data.csv'))
     with open(os.path.join(log_path,'accuracy.txt'), 'w+') as f:
         f.write(str(accuracy))
-    
+
 
 
diff --git a/rm_eval/eval_grm.py b/rm_eval/eval_grm.py
index 957c358..72c26e1 100644
--- a/rm_eval/eval_grm.py
+++ b/rm_eval/eval_grm.py
@@ -19,10 +19,10 @@
 @dataclass
 class ScriptArguments:
     per_device_eval_batch_size: Optional[int] = field(default=8)
-    max_length: Optional[int] = field(default=1024) 
+    max_length: Optional[int] = field(default=1024)
     base_model: Optional[str] = field(default="Ray2333/GRM-llama3-8B-sftreg")
     peft_name: Optional[str] = field(default='')
-    layer_type: Optional[str] = field(default='mlp') 
+    layer_type: Optional[str] = field(default='mlp')
     num_layers: Optional[int] = field(default=1)
     log_dir: Optional[str] = field(default='./eval_reward_grm')
     task: Optional[Literal['unified', 'hhh', 'mtbench']] = field(default='unified')
@@ -33,7 +33,7 @@ class ScriptArguments:
 script_args = parser.parse_args_into_dataclasses()[0]
 
 accelerator = Accelerator()
-device = Accelerator().local_process_index 
+device = Accelerator().local_process_index
 model_name = script_args.base_model
 log_path = os.path.join(script_args.log_dir, model_name.split('/')[-1], script_args.task)
 
@@ -71,8 +71,8 @@ class ScriptArguments:
 pbar = tqdm(total=len(eval_dataset) // script_args.per_device_eval_batch_size // accelerator.num_processes)
 with torch.no_grad():
     for i, batch in enumerate(eval_data_loader):
-        reward_chosen_tensors = model_withhead_forward(model, batch["input_ids"], batch["attention_mask_chosen"], device, forward_type='reward') 
-        reward_rejected_tensors = model_withhead_forward(model, batch["input_ids_rejected"], batch["attention_mask_rejected"], device, forward_type='reward') 
+        reward_chosen_tensors = model_withhead_forward(model, batch["input_ids"], batch["attention_mask_chosen"], device, forward_type='reward')
+        reward_rejected_tensors = model_withhead_forward(model, batch["input_ids_rejected"], batch["attention_mask_rejected"], device, forward_type='reward')
         full_rewards_chosen.extend(reward_chosen_tensors)
         full_rewards_rejected.extend(reward_rejected_tensors)
         full_chosen_prompts.extend(batch['input_ids'])
@@ -105,7 +105,7 @@ class ScriptArguments:
         'rejected_prompts': all_rejected_prompts,
         'chosen_rewards': all_rewards_chosen,
         'rejected_rewards': all_rewards_rejected,
-        
+
     }
     if 'source_id' in batch.keys():
         evaluation_result['source_ids'] = all_source_ids
@@ -118,6 +118,6 @@ class ScriptArguments:
     dataframe.to_csv(os.path.join(log_path, 'eval_data.csv'))
     with open(os.path.join(log_path,'accuracy.txt'), 'w+') as f:
         f.write(str(accuracy))
-    
+
 
 
diff --git a/rm_eval/grm_utils.py b/rm_eval/grm_utils.py
index 94dfed4..6979255 100644
--- a/rm_eval/grm_utils.py
+++ b/rm_eval/grm_utils.py
@@ -57,7 +57,7 @@ def __init__(self, config, **kwargs):
         for i in range(num_layers):
             module_lis.extend([nn.Linear(input_neurons, num_neurons), nn.ReLU()])
             input_neurons = num_neurons
-        
+
         module_lis.append(nn.Linear(num_neurons, num_output))
         self.summary = nn.Sequential(*module_lis)
         self.flatten = nn.Flatten()
@@ -137,7 +137,7 @@ def forward(
             last_hidden_state = last_hidden_state.to(self.v_head.summary.weight.device)
         elif not hasattr(self.v_head.summary, 'weight') and (last_hidden_state.device != self.v_head.summary[0].weight.device):
             last_hidden_state = last_hidden_state.to(self.v_head.summary[0].weight.device)
-        
+
         # use the last token value as reward
         last_index = attention_mask.sum(dim=-1) - 1
         value = self.v_head(last_hidden_state).squeeze(-1)[torch.arange(len(last_hidden_state)), last_index]
@@ -164,7 +164,7 @@ def push_to_hub(self, *args, **kwargs):
         setattr(self.pretrained_model, "v_head", self.v_head)
 
         return self.pretrained_model.push_to_hub(*args, **kwargs)
-    
+
     def post_init(self, state_dict):
         r"""
@@ -203,7 +203,7 @@ def set_device_hook(module, input, outputs):
 
             self.register_forward_hook(set_device_hook)
             self.is_sequential_parallel = True
-    
+
     @classmethod
     def register_for_auto_class(cls, auto_class="AutoModel"):
         if not isinstance(auto_class, str):
@@ -279,7 +279,7 @@ def load_model_withhead(model_name, peft_name, tokenizer, device, \
 
     if 'Mistral' not in model_name:
         model_config['attn_implementation'] = "flash_attention_2"
-    
+
     if not len(peft_name):
         model_config.pop('attn_implementation')
         model = GRewardModel.from_pretrained(model_name, **model_config)
@@ -287,7 +287,7 @@ def load_model_withhead(model_name, peft_name, tokenizer, device, \
     else:
         model = AutoModelForCausalLMWithValueHead.from_pretrained(model_name, **model_config)
         model.pretrained_model.resize_token_embeddings(len(tokenizer))
-    
+
     model.config.pad_token_id = tokenizer.pad_token_id
     if len(peft_name) and os.path.exists(peft_name):
         peft_config = PeftConfig.from_pretrained(peft_name)
@@ -306,7 +306,7 @@ def load_model_withhead(model_name, peft_name, tokenizer, device, \
         loaded_state_dict = torch.load(os.path.join(peft_name, "pytorch_model.bin"))
         missing, unexpected = model.base_model.model.pretrained_model.load_state_dict(loaded_state_dict, strict=False)
         missing, unexpected = model.base_model.model.load_state_dict(loaded_state_dict, strict=False)
-    
+
     if hasattr(model, 'merge_and_unload'):
         model = model.merge_and_unload()
     return model
@@ -319,7 +319,7 @@ def model_withhead_forward(model, input_ids, attention_mask, device, forward_typ
     elif forward_type == 'dpo':
         res = model(input_ids.to(device), attention_mask=attention_mask.to(device))
         if len(res) == 3:
-            logits, _, _ = res 
+            logits, _, _ = res
         else:
            logits = res.logits
         if logits.shape[:-1] != labels.shape:
diff --git a/rm_eval/load_eval_datasets.py b/rm_eval/load_eval_datasets.py
index 0219ae6..51298e5 100644
--- a/rm_eval/load_eval_datasets.py
+++ b/rm_eval/load_eval_datasets.py
@@ -39,11 +39,11 @@ def formatting_func(example):
         else:
             chosen_messages = example['conv_B']
             rejected_messages = example['conv_A']
-        
+
         if 'summarize' in example['source']:
             chosen_messages[0]['content'] = 'Generate one-sentence summary for the following post: ' + chosen_messages[0]['content'].strip()
             rejected_messages[0]['content'] = 'Generate one-sentence summary for the following post: ' + rejected_messages[0]['content'].strip()
-        
+
         prompt_plus_chosen_response = tokenizer.apply_chat_template(chosen_messages, tokenize=False)
         prompt_plus_rejected_response = tokenizer.apply_chat_template(rejected_messages, tokenize=False)
         tokens_chosen = tokenizer.encode_plus(prompt_plus_chosen_response, **kwargs)
@@ -73,7 +73,7 @@ def build_ood_eval_dataset(data_path, tokenizer, split='test', size=None):
                 new_ds = new_ds.add_column('source_id', [i] * len(new_ds))
                 if ds_tmp is None:
                     ds_tmp = new_ds
-                
+
                 else:
                     ds_tmp = concatenate_datasets([ds_tmp, new_ds])
         ds = ds_tmp
@@ -81,7 +81,7 @@ def build_ood_eval_dataset(data_path, tokenizer, split='test', size=None):
         ds_raw = load_dataset('lmsys/mt_bench_human_judgments')
         ds = concatenate_datasets([
             ds_raw['human'].add_column('source_id', [0] * len(ds_raw['human'])),
-            ds_raw['gpt4_pair'].add_column('source_id', [1] * len(ds_raw['gpt4_pair'])), 
+            ds_raw['gpt4_pair'].add_column('source_id', [1] * len(ds_raw['gpt4_pair'])),
         ])
     else:
         ds = load_dataset(data_path, split=split)
@@ -109,7 +109,7 @@ def formatting_func(example):
                     {'role': 'user', 'content': human_msg},
                     {'role': 'assistant', 'content': assistant_msg},
                 ])
-            else: # last 
+            else: # last
                 human_msg = lis.strip()
                 chosen_messages.extend([
                     {'role': 'user', 'content': human_msg},
@@ -177,9 +177,9 @@ def load_eval_dataset(task, tokenizer, size=None):
         if 'hhh' in task:
             data_path = 'HuggingFaceH4/hhh_alignment'
         elif 'mt' in task:
-            data_path = 'lmsys/mt_bench_human_judgments' 
+            data_path = 'lmsys/mt_bench_human_judgments'
         else:
             raise NotImplementedError
-        
+
         eval_dataset = build_ood_eval_dataset(data_path, tokenizer, split='test', size=size)
     return eval_dataset
\ No newline at end of file
diff --git a/scripts/eval_bt_rm.sh b/scripts/eval_bt_rm.sh
index d32ae93..70393d7 100644
--- a/scripts/eval_bt_rm.sh
+++ b/scripts/eval_bt_rm.sh
@@ -11,8 +11,8 @@ save_all_data=False
 freeze_pretrained=False # for freeze pretrained feature baseline
 
 cd ../rm_eval
-for task in 'unified' 'hhh' 'mtbench' 
-do 
+for task in 'unified' 'hhh' 'mtbench'
+do
     CUDA_VISIBLE_DEVICES=${gpu} accelerate launch --main_process_port ${port} eval.py \
         --base_model ${base_model} --peft_name ${peft_name} \
         --per_device_eval_batch_size ${per_device_eval_batch_size} \
diff --git a/scripts/eval_grm_rm.sh b/scripts/eval_grm_rm.sh
index 5416c39..6b80264 100644
--- a/scripts/eval_grm_rm.sh
+++ b/scripts/eval_grm_rm.sh
@@ -11,12 +11,12 @@ log_dir='./eval_GRM'
 save_all_data=False
 
 cd ../rm_eval
-for task in 'unified' 'hhh' 'mtbench' 
-do 
+for task in 'unified' 'hhh' 'mtbench'
+do
     CUDA_VISIBLE_DEVICES=${gpu} accelerate launch --main_process_port ${port} eval_grm.py --base_model ${base_model} --peft_name ${peft_name} \
         --per_device_eval_batch_size ${per_device_eval_batch_size} \
         --max_length ${max_length} --log_dir ${log_dir} --save_all_data ${save_all_data} \
-        --task ${task} --layer_type ${layer_type} --num_layers ${num_layers} 
+        --task ${task} --layer_type ${layer_type} --num_layers ${num_layers}
 done
 
diff --git a/scripts/eval_grm_rm_full.sh b/scripts/eval_grm_rm_full.sh
index f6fa052..69d59ab 100644
--- a/scripts/eval_grm_rm_full.sh
+++ b/scripts/eval_grm_rm_full.sh
@@ -11,11 +11,11 @@ log_dir='./eval_GRM'
 save_all_data=False
 
 cd ../rm_eval
-for task in 'unified' 'hhh' 'mtbench' 
-do 
+for task in 'unified' 'hhh' 'mtbench'
+do
    CUDA_VISIBLE_DEVICES=${gpu} accelerate launch --main_process_port ${port} eval_grm.py --base_model ${base_model} --per_device_eval_batch_size ${per_device_eval_batch_size} \
         --max_length ${max_length} --log_dir ${log_dir} --save_all_data ${save_all_data} \
-        --task ${task} --layer_type ${layer_type} --num_layers ${num_layers} 
+        --task ${task} --layer_type ${layer_type} --num_layers ${num_layers}
 done
 
diff --git a/scripts/rlhf/bon/step1_train_proxy_reward_model_baseline.sh b/scripts/rlhf/bon/step1_train_proxy_reward_model_baseline.sh
index 45dbc68..cc55c17 100644
--- a/scripts/rlhf/bon/step1_train_proxy_reward_model_baseline.sh
+++ b/scripts/rlhf/bon/step1_train_proxy_reward_model_baseline.sh
@@ -26,6 +26,6 @@ CUDA_VISIBLE_DEVICES=${devices} accelerate launch --num_processes ${n_gpu} --mai
     --lora_r ${lora_r} --lora_alpha ${lora_alpha} \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${learning_rate} --loss_type ${loss_type} \
-    --dataset ${dataset_name} 
+    --dataset ${dataset_name}
 
 
diff --git a/scripts/rlhf/bon/step3_obtain_proxy_score.sh b/scripts/rlhf/bon/step3_obtain_proxy_score.sh
index 74b2cf3..8312c5f 100644
--- a/scripts/rlhf/bon/step3_obtain_proxy_score.sh
+++ b/scripts/rlhf/bon/step3_obtain_proxy_score.sh
@@ -32,5 +32,5 @@ CUDA_VISIBLE_DEVICES=${devices} accelerate launch --num_processes ${n_gpu} --mai
 
 
-    
+
 
 
diff --git a/scripts/rlhf/bon/step6_collect.sh b/scripts/rlhf/bon/step6_collect.sh
index 3838824..61aece5 100644
--- a/scripts/rlhf/bon/step6_collect.sh
+++ b/scripts/rlhf/bon/step6_collect.sh
@@ -6,5 +6,5 @@ python rlhf/bon/step6_collect.py \
     --gold_score_path 'rlhf/bon/step5_obtain_bon_gold_score/gemma-2b-it/grm/gold_score.csv' \
     --output_path 'rlhf/bon/step6_collect/gemma-2b-it/grm' \
     --n_values_start 1 \
-    --n_values_end 406 
-    
+    --n_values_end 406
+
diff --git a/scripts/rlhf/data_generation4rlhf.sh b/scripts/rlhf/data_generation4rlhf.sh
index 6e8d97d..4c78820 100644
--- a/scripts/rlhf/data_generation4rlhf.sh
+++ b/scripts/rlhf/data_generation4rlhf.sh
@@ -13,7 +13,7 @@ python rlhf/data_generation/sample_dataset.py \
     --test_size 1000 \
     --save_path 'rlhf/data' \
     --save_name 'unified_sampled'
-    
+
 
 CUDA_VISIBLE_DEVICES=${devices} accelerate launch --num_processes ${n_gpu} --main_process_port ${main_process_port} \
     rlhf/data_generation/obtain_gold_score.py \
@@ -23,8 +23,8 @@ CUDA_VISIBLE_DEVICES=${devices} accelerate launch --num_processes ${n_gpu} --mai
     --model_path "Ray2333/reward-model-Mistral-7B-instruct-Unified-Feedback" \
     --save_path "rlhf/data" \
     --save_name "unified_sampled_gold_score" \
-    --mode "train" 
-    
+    --mode "train"
+
 CUDA_VISIBLE_DEVICES=${devices} accelerate launch --num_processes ${n_gpu} --main_process_port ${main_process_port} \
     rlhf/data_generation/obtain_gold_score.py \
     --per_device_batch_size 16 \
@@ -33,4 +33,4 @@ CUDA_VISIBLE_DEVICES=${devices} accelerate launch --num_processes ${n_gpu} --mai
     --model_path "Ray2333/reward-model-Mistral-7B-instruct-Unified-Feedback" \
     --save_path "rlhf/data" \
     --save_name "unified_sampled_gold_score" \
-    --mode "test" 
\ No newline at end of file
+    --mode "test"
\ No newline at end of file
diff --git a/scripts/rlhf/ppo/train_ppo.sh b/scripts/rlhf/ppo/train_ppo.sh
index 9b7f9f2..4496afc 100644
--- a/scripts/rlhf/ppo/train_ppo.sh
+++ b/scripts/rlhf/ppo/train_ppo.sh
@@ -9,11 +9,11 @@ eval_dataset_path="rlhf/data/unified_1k" # set the eval dataset
 cd ../../../
 
 # 4 gpus for 2b rm
-gpu=0,1,2,3 
+gpu=0,1,2,3
 num_processes=4
 reward_base_model="google/gemma-2b-it"
 ### you need set this path
-reward_peft_path='rlhf/save_reward_models/gemma-2b-it_BT_RM_seed2_len1024_lora32_1e-05_dataUnified-Feedback/logs/checkpoint-3536' 
+reward_peft_path='rlhf/save_reward_models/gemma-2b-it_BT_RM_seed2_len1024_lora32_1e-05_dataUnified-Feedback/logs/checkpoint-3536'
 wandb_name="ppo_rm2B_lr1e-5_klreg0.0_normrewards"
 CUDA_VISIBLE_DEVICES=${gpu} accelerate launch --main_process_port 9989 --num_processes ${num_processes} rlhf/ppo/ppo.py \
     --base_model_name ${base_model_name} \
@@ -47,4 +47,3 @@ CUDA_VISIBLE_DEVICES=${gpu} accelerate launch --main_process_port 9989 --num_pro
     --normalize_rewards True \
     --learning_rate 1e-5 \
 
-    
\ No newline at end of file
diff --git a/scripts/rlhf/ppo/train_ppo_ensemble.sh b/scripts/rlhf/ppo/train_ppo_ensemble.sh
index fd11eee..538899e 100644
--- a/scripts/rlhf/ppo/train_ppo_ensemble.sh
+++ b/scripts/rlhf/ppo/train_ppo_ensemble.sh
@@ -28,7 +28,7 @@ CUDA_VISIBLE_DEVICES=${gpu} accelerate launch --main_process_port 9991 rlhf/ppo/
     --eval_every ${eval_every} \
     --ensemble_method ${ensemble_method} \
     --normalize_rewards True \
-    --learning_rate 1e-5 
+    --learning_rate 1e-5
 
 
 
@@ -49,7 +49,7 @@ CUDA_VISIBLE_DEVICES=${gpu} accelerate launch --main_process_port 9998 rlhf/ppo/
 
 
 
-    
+
 
 
diff --git a/scripts/rlhf/ppo/train_ppo_grm.sh b/scripts/rlhf/ppo/train_ppo_grm.sh
index 6913695..b93d212 100644
--- a/scripts/rlhf/ppo/train_ppo_grm.sh
+++ b/scripts/rlhf/ppo/train_ppo_grm.sh
@@ -12,7 +12,7 @@ gpu=4,5,6,7
 num_processes=1
 reward_base_model="google/gemma-2b-it"
 ### you need set this path
-reward_peft_path='save_reward_models/gemma-2b-it_GRM_seed1_len1024_lora32_1e-05_dataUnified-Feedback/logs/checkpoint-3536' 
+reward_peft_path='save_reward_models/gemma-2b-it_GRM_seed1_len1024_lora32_1e-05_dataUnified-Feedback/logs/checkpoint-3536'
 wandb_name="ppo_grm2B_lr1e-5_klreg0.0_normrewards"
 CUDA_VISIBLE_DEVICES=${gpu} accelerate launch --main_process_port 10007 --num_processes ${num_processes} rlhf/ppo/ppo_grm.py \
     --base_model_name ${base_model_name} \
@@ -27,7 +27,7 @@ CUDA_VISIBLE_DEVICES=${gpu} accelerate launch --main_process_port 10007 --num_pr
     --learning_rate 1e-5 \
     --layer_type 'mlp' --num_layers 1 \
     --debug False
-    
+
 
 
 # # training 7B reward model requires 6 gpus and 4 process (other 2 gpus for reward inference)
@@ -49,4 +49,4 @@ CUDA_VISIBLE_DEVICES=${gpu} accelerate launch --main_process_port 10007 --num_pr
 # --normalize_rewards True \
 # --learning_rate 1e-5 \
 # --layer_type 'mlp' --num_layers 1 \
-    
+
diff --git a/scripts/train_bt_rm_full.sh b/scripts/train_bt_rm_full.sh
index 3802c97..3a23aca 100644
--- a/scripts/train_bt_rm_full.sh
+++ b/scripts/train_bt_rm_full.sh
@@ -1,6 +1,6 @@
 devices=0,1,2,3
 n_gpu=4
-export NCCL_P2P_DISABLE=1 
+export NCCL_P2P_DISABLE=1
 # dataset_name='hendrydong/preference_700K'
 dataset_name='Skywork/Skywork-Reward-Preference-80K-v0.2'
 base_model='google/gemma-2b-it'
diff --git a/scripts/train_bt_rm_lora.sh b/scripts/train_bt_rm_lora.sh
index 4a85d04..eb31525 100644
--- a/scripts/train_bt_rm_lora.sh
+++ b/scripts/train_bt_rm_lora.sh
@@ -25,4 +25,4 @@ CUDA_VISIBLE_DEVICES=${devices} accelerate launch --num_processes ${n_gpu} --mai
     --lora_r ${lora_r} --lora_alpha ${lora_alpha} \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${learning_rate} --loss_type ${loss_type} \
-    --dataset ${dataset_name} --dataset_mode ${dataset_mode} 
\ No newline at end of file
+    --dataset ${dataset_name} --dataset_mode ${dataset_mode}
\ No newline at end of file
diff --git a/scripts/train_grm_full.sh b/scripts/train_grm_full.sh
index 3e3dc73..bc1c63a 100644
--- a/scripts/train_grm_full.sh
+++ b/scripts/train_grm_full.sh
@@ -7,7 +7,7 @@ log_dir='../save_reward_models'
 main_process_port=9994
 
 learning_rate=5e-6
-max_length=3000 
+max_length=3000
 num_train_epochs=1
 gradient_accumulation_steps=64
 
diff --git a/scripts/train_grm_lora.sh b/scripts/train_grm_lora.sh
index 2331efd..51770de 100644
--- a/scripts/train_grm_lora.sh
+++ b/scripts/train_grm_lora.sh
@@ -15,7 +15,7 @@ num_train_epochs=2
 gradient_accumulation_steps=4
 weight_ratio=0.01
 
-layer_type='mlp' 
+layer_type='mlp'
 sft_only=True
 reference_free=True
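Note on the reward post-processing shared by the three PPO scripts touched above (rlhf/ppo/ppo.py, ppo_grm.py, ppo_rm_ensemble.py): each one scores the generated responses with a frozen reward model under torch.no_grad() and, when --normalize_rewards is set, fixes the normalization statistics on the very first batch (epoch == 0 and i == 0) and reuses them for the rest of training. The sketch below illustrates that first-batch-normalization pattern in isolation; the class name, the choice of mean/std statistics, and the epsilon are illustrative assumptions, not the repository's verbatim implementation (the hunks in this patch only show the gating condition).

import torch

class FirstBatchRewardNormalizer:
    """Capture reward statistics once (on the first batch) and reuse them afterwards.

    Hedged sketch: the PPO scripts in this patch only show the gating condition
    (`epoch == 0 and i == 0`); the exact statistics they store are assumed here
    to be mean/std, which is one common choice.
    """

    def __init__(self, eps: float = 1e-6):
        self.mean = None
        self.std = None
        self.eps = eps

    def __call__(self, rewards):
        r = torch.tensor(rewards, dtype=torch.float32)
        if self.mean is None:  # first batch: freeze the statistics
            self.mean = r.mean()
            self.std = r.std().clamp_min(self.eps)
        return ((r - self.mean) / self.std).tolist()

# Hypothetical usage at the point marked "# normalize using the first batch
# statistics" in the training loops above:
# normalizer = FirstBatchRewardNormalizer()
# if script_args.normalize_rewards:
#     rewards = normalizer(rewards)

Freezing the statistics after the first batch keeps the reward scale seen by PPO stable over training, at the cost of using a noisier estimate than a running average would give.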