diff --git a/turbo_alignment/cli/train.py b/turbo_alignment/cli/train.py
index 314a85c..af38656 100755
--- a/turbo_alignment/cli/train.py
+++ b/turbo_alignment/cli/train.py
@@ -130,18 +130,18 @@ def reinforce_training(
     experiment_settings = pipeline_settings.REINFORCETrainExperimentSettings.parse_file(experiment_settings_path)
 
-    policy_models = RayGroup(num_nodes=2, num_gpus_per_node=8, ray_actor_type=pipelines.TrainREINFORCEStrategy)
+    policy_models = RayGroup(num_nodes=experiment_settings.trainer_settings.num_nodes, num_gpus_per_node=8, ray_actor_type=pipelines.TrainREINFORCEStrategy)
     reward_model = RayGroup(num_nodes=1, num_gpus_per_node=1, ray_actor_type=RewardModel)
     # reference_model = RayGroup(num_nodes=1, num_gpus_per_node=1, ray_actor_type=ReferenceModel)
 
     # TODO_RLOO if possible hide init inside RayGroup
+    # TODO add settings fields to reward model
     ray.get(policy_models.async_init_model_from_pretrained())
     ray.get(reward_model.async_init_model_from_pretrained(rm_model=experiment_settings.reward_model_settings.model_path))
     # ray.get(reference_model.async_init_model_from_pretrained(pretrain=experiment_settings.model_settings.model_path))
 
     '''
-    TODO_RLOO:
-    1. SEED FIX
+    TODO_RLOO:
     2. PARAMS to REINFORCETrainExperimentSettings
     3. if possible hide creating of vllm engines inside trainer
     '''
 
@@ -150,7 +150,7 @@ def reinforce_training(
         num_engines=experiment_settings.trainer_settings.actor_settings.vllm_num_engines,
         tensor_parallel_size=experiment_settings.trainer_settings.actor_settings.vllm_tensor_parallel_size,
         pretrain=str(experiment_settings.model_settings.model_path),
-        seed=0,
+        seed=experiment_settings.seed,
         enable_prefix_caching=False,
         enforce_eager=False,
         max_model_len=experiment_settings.trainer_settings.actor_settings.max_model_len,
diff --git a/turbo_alignment/generators/chat.py b/turbo_alignment/generators/chat.py
index 440cbc0..e3580a9 100755
--- a/turbo_alignment/generators/chat.py
+++ b/turbo_alignment/generators/chat.py
@@ -43,10 +43,6 @@ def __init__(
         # if transformers_settings.num_beams > 1:
         #     beam_search_params['use_beam_search'] = True
         #     beam_search_params['best_of'] = transformers_settings.num_beams
-        print(f'Generation Params:{transformers_settings.stop_strings=}\n{transformers_settings.num_return_sequences=}\n\
-            {transformers_settings.repetition_penalty=}\n{transformers_settings.temperature=}\n\
-            {transformers_settings.top_p=}\n{transformers_settings.top_k=}\n\
-            {custom_generation_settings.skip_special_tokens=}\n{self.eos_token_id=}\n{transformers_settings.max_new_tokens=}', flush=True)
 
         self._sampling_params = SamplingParams(
             n=transformers_settings.num_return_sequences,
diff --git a/turbo_alignment/pipelines/train/reinforce.py b/turbo_alignment/pipelines/train/reinforce.py
index 7e6c192..7ba1d87 100644
--- a/turbo_alignment/pipelines/train/reinforce.py
+++ b/turbo_alignment/pipelines/train/reinforce.py
@@ -41,9 +41,6 @@ class ReinforceDataCollator(DataCollatorForTokenClassification):
     def torch_call(self, features):
         import torch
         from transformers.data.data_collator import pad_without_fast_tokenizer_warning
-
-        for _ in features:
-            print(f'{_.keys()=}')
         label_name = "label" if "label" in features[0].keys() else "labels"
         labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
 
diff --git a/turbo_alignment/settings/pipelines/train/reinforce.py b/turbo_alignment/settings/pipelines/train/reinforce.py
index 6ac54dc..c530731 100644
--- a/turbo_alignment/settings/pipelines/train/reinforce.py
+++ b/turbo_alignment/settings/pipelines/train/reinforce.py
@@ -16,6 +16,8 @@ from typing import Union
 
 
 class REINFORCETrainerSettings(TrainerSettings):
+
+    num_nodes: int = 2
     max_new_tokens: int = 1024
     stop_token: str = ''
 
diff --git a/turbo_alignment/trainers/online/reinforce.py b/turbo_alignment/trainers/online/reinforce.py
index eef8f3c..056ad47 100644
--- a/turbo_alignment/trainers/online/reinforce.py
+++ b/turbo_alignment/trainers/online/reinforce.py
@@ -48,6 +48,7 @@
 
 # FIXME
 @dataclass
 class REINFORCETrainingArguments(TrainingArguments):
+    num_nodes: int = 2
     max_new_tokens: int = 1024
     stop_token: str = ''
@@ -195,11 +196,15 @@ def __init__(
             mean_baseline_coef=args.mean_baseline_coef,
             num_generations=args.num_generations,
         )
+
+        print("Generation Params:\n" + "\n".join([f"{attr}: {getattr(self.generator_transformers_settings, attr, None)}" for attr, _ in self.generator_transformers_settings.__annotations__.items()]))
+        start = time.time()
         self.print_readable_stats()
         self.norm_reward_mean, self.norm_reward_std = self.reward_stats(
             model=self.model, dataloader=self.get_train_dataloader()
         )
+        logging.info(f'statistics in __init__ elapsed time: {time.time() - start}')
 
         self.print_readable_stats()
@@ -467,9 +472,6 @@ def prediction_step(
         ignore_keys: Optional[List[str]] = None,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
-        import logging
-        logging.info(f'{isinstance(model, DeepSpeedEngine)=}')
-
         with torch.no_grad():
             loss, metrics = self.get_batch_loss_metrics(model, inputs, 'eval')
 
diff --git a/turbo_alignment/trainers/online/reward_actor.py b/turbo_alignment/trainers/online/reward_actor.py
index 0c0d755..cd81a12 100644
--- a/turbo_alignment/trainers/online/reward_actor.py
+++ b/turbo_alignment/trainers/online/reward_actor.py
@@ -12,7 +12,7 @@ def __init__(self, world_size, rank, local_rank, master_addr, master_port):
 
     def init_model_from_pretrained(self, rm_model):
         self._setup_distributed()
-        self.model = AutoModelForSequenceClassification.from_pretrained(rm_model, device_map='cuda', torch_dtype=torch.bfloat16)
+        self.model = AutoModelForSequenceClassification.from_pretrained(rm_model, device_map='cuda', torch_dtype=torch.bfloat16, attn_implementation='flash_attention_2')
         self.tokenizer = AutoTokenizer.from_pretrained(rm_model, trust_remote_code=True)
         self.model.config.pad_token_id = self.model.config.eos_token_id