From c10cc8995b6fd45f3a876ec98cade97251abe733 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <45557362+qgallouedec@users.noreply.github.com>
Date: Tue, 26 Nov 2024 20:37:27 +0100
Subject: [PATCH] =?UTF-8?q?=F0=9F=97=9D=EF=B8=8F=20Update=20type=20hints?=
 =?UTF-8?q?=20(#2399)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* New type hint structure

* Update type hints

* Delete wrong file

* Remove dict import
---
 examples/datasets/hh-rlhf-helpful-base.py     |  4 +-
 .../stack_llama/scripts/reward_modeling.py    |  4 +-
 .../stack_llama_2/scripts/dpo_llama2.py       | 10 +--
 examples/scripts/sft_video_llm.py             |  6 +-
 trl/commands/cli_utils.py                     |  2 +-
 trl/core.py                                   | 18 +++---
 trl/data_utils.py                             | 26 ++++----
 trl/extras/best_of_n_sampler.py               | 16 ++---
 trl/mergekit_utils.py                         |  4 +-
 trl/models/modeling_sd_base.py                | 36 +++++------
 trl/models/utils.py                           |  4 +-
 trl/trainer/alignprop_config.py               | 18 +++---
 trl/trainer/alignprop_trainer.py              | 16 ++---
 trl/trainer/bco_config.py                     | 10 +--
 trl/trainer/bco_trainer.py                    | 62 +++++++++---------
 trl/trainer/callbacks.py                      | 10 +--
 trl/trainer/cpo_config.py                     |  6 +-
 trl/trainer/cpo_trainer.py                    | 56 ++++++++--------
 trl/trainer/ddpo_trainer.py                   | 20 +++---
 trl/trainer/dpo_config.py                     | 10 +--
 trl/trainer/dpo_trainer.py                    | 64 +++++++++----------
 trl/trainer/gkd_config.py                     |  6 +-
 trl/trainer/gkd_trainer.py                    | 16 ++---
 trl/trainer/iterative_sft_trainer.py          | 58 ++++++++---------
 trl/trainer/judges.py                         | 64 +++++++++----------
 trl/trainer/kto_config.py                     | 10 +--
 trl/trainer/kto_trainer.py                    | 60 ++++++++---------
 trl/trainer/model_config.py                   | 10 +--
 trl/trainer/nash_md_config.py                 |  3 +-
 trl/trainer/nash_md_trainer.py                | 26 ++++----
 trl/trainer/online_dpo_config.py              |  4 +-
 trl/trainer/online_dpo_trainer.py             | 30 ++++-----
 trl/trainer/orpo_config.py                    |  6 +-
 trl/trainer/orpo_trainer.py                   | 56 ++++++++--------
 trl/trainer/ppo_trainer.py                    | 12 ++--
 trl/trainer/reward_trainer.py                 | 36 +++++------
 trl/trainer/rloo_trainer.py                   | 12 ++--
 trl/trainer/sft_config.py                     | 10 +--
 trl/trainer/sft_trainer.py                    | 22 +++----
 trl/trainer/utils.py                          | 52 +++++++--------
 trl/trainer/xpo_config.py                     |  5 +-
 trl/trainer/xpo_trainer.py                    | 26 ++++----
 42 files changed, 462 insertions(+), 464 deletions(-)

diff --git a/examples/datasets/hh-rlhf-helpful-base.py b/examples/datasets/hh-rlhf-helpful-base.py
index 5399615b5d..84d8010169 100644
--- a/examples/datasets/hh-rlhf-helpful-base.py
+++ b/examples/datasets/hh-rlhf-helpful-base.py
@@ -14,7 +14,7 @@
 
 import re
 from dataclasses import dataclass
-from typing import Dict, List, Optional
+from typing import Optional
 
 from datasets import load_dataset
 from transformers import HfArgumentParser
@@ -51,7 +51,7 @@ def common_start(str1: str, str2: str) -> str:
     return "".join(common_chars)
 
 
-def extract_dialogue(example: str) -> List[Dict[str, str]]:
+def extract_dialogue(example: str) -> list[dict[str, str]]:
     # Extract the prompt, which corresponds to the common start of the chosen and rejected dialogues
     prompt_text = common_start(example["chosen"], example["rejected"])
 
diff --git a/examples/research_projects/stack_llama/scripts/reward_modeling.py b/examples/research_projects/stack_llama/scripts/reward_modeling.py
index 8402413c03..db38f62d4c 100644
--- a/examples/research_projects/stack_llama/scripts/reward_modeling.py
+++ b/examples/research_projects/stack_llama/scripts/reward_modeling.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Optional, Union
 
 import evaluate
 import numpy as np
@@ -236,7 +236,7 @@ class RewardDataCollatorWithPadding:
     pad_to_multiple_of: Optional[int] = None
     return_tensors: str = "pt"
 
-    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
+    def __call__(self, features: list[dict[str, Any]]) -> dict[str, Any]:
         features_j = []
         features_k = []
         for feature in features:
diff --git a/examples/research_projects/stack_llama_2/scripts/dpo_llama2.py b/examples/research_projects/stack_llama_2/scripts/dpo_llama2.py
index 8530cd66c4..b3d287b144 100644
--- a/examples/research_projects/stack_llama_2/scripts/dpo_llama2.py
+++ b/examples/research_projects/stack_llama_2/scripts/dpo_llama2.py
@@ -15,7 +15,7 @@
 # 0. imports
 import os
 from dataclasses import dataclass, field
-from typing import Dict, Optional
+from typing import Optional
 
 import torch
 from accelerate import Accelerator
@@ -109,9 +109,9 @@ def get_stack_exchange_paired(
 
     The dataset is converted to a dictionary with the following structure:
     {
-        'prompt': List[str],
-        'chosen': List[str],
-        'rejected': List[str],
+        'prompt': list[str],
+        'chosen': list[str],
+        'rejected': list[str],
     }
 
     Prompts are structured as follows:
@@ -126,7 +126,7 @@ def get_stack_exchange_paired(
     )
     original_columns = dataset.column_names
 
-    def return_prompt_and_responses(samples) -> Dict[str, str]:
+    def return_prompt_and_responses(samples) -> dict[str, str]:
         return {
             "prompt": ["Question: " + question + "\n\nAnswer: " for question in samples["question"]],
             "chosen": samples["response_j"],
diff --git a/examples/scripts/sft_video_llm.py b/examples/scripts/sft_video_llm.py
index 78941c8363..3343c3a302 100644
--- a/examples/scripts/sft_video_llm.py
+++ b/examples/scripts/sft_video_llm.py
@@ -45,7 +45,7 @@
 import os
 import random
 from dataclasses import dataclass
-from typing import Any, Dict, List
+from typing import Any
 
 import requests
 import torch
@@ -90,7 +90,7 @@ def download_video(url: str, cache_dir: str) -> str:
         raise Exception(f"Failed to download video: {e}") from e
 
 
-def prepare_dataset(example: Dict[str, Any], cache_dir: str) -> Dict[str, List[Dict[str, Any]]]:
+def prepare_dataset(example: dict[str, Any], cache_dir: str) -> dict[str, list[dict[str, Any]]]:
     """Prepare dataset example for training."""
     video_url = example["video_url"]
     timecoded_cc = example["timecoded_cc"]
@@ -120,7 +120,7 @@ def prepare_dataset(example: Dict[str, Any], cache_dir: str) -> Dict[str, List[D
     return {"messages": messages}
 
 
-def collate_fn(examples: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
+def collate_fn(examples: list[dict[str, Any]]) -> dict[str, torch.Tensor]:
     """Collate batch of examples for training."""
     texts = []
     video_inputs = []
diff --git a/trl/commands/cli_utils.py b/trl/commands/cli_utils.py
index b3ce930479..76cc777da2 100644
--- a/trl/commands/cli_utils.py
+++ b/trl/commands/cli_utils.py
@@ -158,7 +158,7 @@ def __init__(self, parsers, ignore_extra_args=False):
         with the processed parsers.
 
         Args:
-            parsers (`List[argparse.ArgumentParser`]):
+            parsers (`list[argparse.ArgumentParser]`):
                 List of parsers.
             ignore_extra_args (`bool`):
                 Whether to ignore extra arguments passed by the config
diff --git a/trl/core.py b/trl/core.py
index 5e7bb840f6..bfb23ccd3b 100644
--- a/trl/core.py
+++ b/trl/core.py
@@ -15,7 +15,7 @@
 import random
 import warnings
 from contextlib import contextmanager
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Optional, Union
 
 import numpy as np
 import torch
@@ -70,10 +70,10 @@ def top_k_top_p_filtering(
     return logits
 
 
-def flatten_dict(nested: Dict, sep: str = "/") -> Dict:
+def flatten_dict(nested: dict, sep: str = "/") -> dict:
     """Flatten dictionary and concatenate nested keys with separator."""
 
-    def recurse(nest: Dict, prefix: str, into: Dict) -> None:
+    def recurse(nest: dict, prefix: str, into: dict) -> None:
         for k, v in nest.items():
             if sep in k:
                 raise ValueError(f"separator '{sep}' not allowed to be in key '{k}'")
@@ -87,7 +87,7 @@ def recurse(nest: Dict, prefix: str, into: Dict) -> None:
     return flat
 
 
-def convert_to_scalar(stats: Dict) -> Dict:
+def convert_to_scalar(stats: dict) -> dict:
     """
     Converts the stats from a flattened dict to single scalar dicts
     """
@@ -103,7 +103,7 @@ def convert_to_scalar(stats: Dict) -> Dict:
     return tensorboard_stats
 
 
-def stack_dicts(stats_dicts: List[Dict]) -> Dict:
+def stack_dicts(stats_dicts: list[dict]) -> dict:
     """Stack the values of a dict."""
     results = dict()
     for k in stats_dicts[0]:
@@ -185,7 +185,7 @@ def entropy_from_logits(logits: torch.Tensor) -> torch.Tensor:
     return entropy
 
 
-def stats_to_np(stats_dict: Dict) -> Dict:
+def stats_to_np(stats_dict: dict) -> dict:
     """Cast all torch.tensors in dict to numpy arrays."""
     new_dict = dict()
     for k, v in stats_dict.items():
@@ -202,7 +202,7 @@ def stats_to_np(stats_dict: Dict) -> Dict:
 
 
 def respond_to_batch(
-    model: nn.Module, queries: List[torch.LongTensor], txt_len: int = 20, top_k: int = 0, top_p: float = 1.0
+    model: nn.Module, queries: list[torch.LongTensor], txt_len: int = 20, top_k: int = 0, top_p: float = 1.0
 ) -> torch.LongTensor:
     """Sample text from language model."""
     input_ids = queries
@@ -271,8 +271,8 @@ def empty_device_cache(cls):
 
 
 def randn_tensor(
-    shape: Union[Tuple, List],
-    generator: Optional[Union[List[torch.Generator], torch.Generator]] = None,
+    shape: Union[tuple, list],
+    generator: Optional[Union[list[torch.Generator], torch.Generator]] = None,
     device: Optional[torch.device] = None,
     dtype: Optional[torch.dtype] = None,
     layout: Optional[torch.layout] = None,
diff --git a/trl/data_utils.py b/trl/data_utils.py
index 146466bd6b..88319626b8 100644
--- a/trl/data_utils.py
+++ b/trl/data_utils.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any, Dict, List, Optional, Sequence, TypeVar
+from typing import Any, Optional, Sequence, TypeVar
 
 from datasets import Dataset, DatasetDict
 from transformers import PreTrainedTokenizer
@@ -20,12 +20,12 @@
 DatasetType = TypeVar("DatasetType", Dataset, DatasetDict)
 
 
-def is_conversational(example: Dict[str, Any]) -> bool:
+def is_conversational(example: dict[str, Any]) -> bool:
     r"""
     Check if the example is in a conversational format.
 
     Args:
-        example (`Dict[str, Any]`):
+        example (`dict[str, Any]`):
             A single data entry of a dataset. The example can have different keys depending on the
             dataset type.
 
@@ -60,7 +60,7 @@ def is_conversational(example: Dict[str, Any]) -> bool:
     return False
 
 
-def apply_chat_template(example: Dict[str, List[Dict[str, str]]], tokenizer: PreTrainedTokenizer) -> Dict[str, str]:
+def apply_chat_template(example: dict[str, list[dict[str, str]]], tokenizer: PreTrainedTokenizer) -> dict[str, str]:
     r"""
     Apply a chat template to a conversational example.
 
@@ -139,13 +139,13 @@ def apply_chat_template(example: Dict[str, List[Dict[str, str]]], tokenizer: Pre
 
 
 def maybe_apply_chat_template(
-    example: Dict[str, List[Dict[str, str]]], tokenizer: PreTrainedTokenizer
-) -> Dict[str, str]:
+    example: dict[str, list[dict[str, str]]], tokenizer: PreTrainedTokenizer
+) -> dict[str, str]:
     r"""
     If the example is in a conversational format, apply a chat template to it.
 
     Args:
-        example (`Dict[str, List[Dict[str, str]]`):
+        example (`dict[str, list[dict[str, str]]]`):
             Dictionary representing a single data entry of a conversational dataset. Each data entry can have different
             keys depending on the dataset type. The supported dataset types are:
 
@@ -163,7 +163,7 @@ def maybe_apply_chat_template(
             The tokenizer to apply the chat template with.
 
     Returns:
-        `Dict[str, str]`: The formatted example with the chat template applied.
+        `dict[str, str]`: The formatted example with the chat template applied.
 
     Note:
         This function does not alter the keys, except for Language modeling dataset, where `"messages"` is replaced by
@@ -188,7 +188,7 @@ def maybe_apply_chat_template(
         return example
 
 
-def _unpair_row(examples: List[Dict[str, List[Dict[str, str]]]]) -> List[Dict[str, List[Dict[str, str]]]]:
+def _unpair_row(examples: list[dict[str, list[dict[str, str]]]]) -> list[dict[str, list[dict[str, str]]]]:
     batch_size = len(examples["chosen"])
     new_rows = {
         "completion": examples["chosen"] + examples["rejected"],
@@ -288,7 +288,7 @@ def maybe_unpair_preference_dataset(
         return dataset
 
 
-def extract_prompt(example: Dict[str, Sequence]) -> Dict[str, Sequence]:
+def extract_prompt(example: dict[str, Sequence]) -> dict[str, Sequence]:
     r"""
     Extracts the shared prompt from a preference data example, where the prompt is implicit within both
     the chosen and rejected completions.
@@ -307,7 +307,7 @@ def extract_prompt(example: Dict[str, Sequence]) -> Dict[str, Sequence]:
     }
 
 
-def maybe_extract_prompt(example: Dict[str, List]) -> Dict[str, List]:
+def maybe_extract_prompt(example: dict[str, list]) -> dict[str, list]:
     r"""
     Extracts the shared prompt from a preference data example, where the prompt is implicit within both
     the chosen and rejected completions.
@@ -318,12 +318,12 @@ def maybe_extract_prompt(example: Dict[str, List]) -> Dict[str, List]:
     "rejected" completions.
 
     Args:
-        example (`Dict[str, List]`):
+        example (`dict[str, list]`):
             A dictionary representing a single data entry in the preference dataset. It must contain the keys
             `"chosen"` and `"rejected"`, where each value is either conversational or standard (`str`).
 
     Returns:
-        `Dict[str, List]`: A dictionary containing:
+        `dict[str, list]`: A dictionary containing:
             - `"prompt"`: The longest common prefix between the "chosen" and "rejected" completions.
             - `"chosen"`: The remainder of the "chosen" completion, with the prompt removed.
             - `"rejected"`: The remainder of the "rejected" completion, with the prompt removed.
diff --git a/trl/extras/best_of_n_sampler.py b/trl/extras/best_of_n_sampler.py
index 5f3363f4c9..646cee1318 100644
--- a/trl/extras/best_of_n_sampler.py
+++ b/trl/extras/best_of_n_sampler.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any, Callable, List, Optional, Union
+from typing import Any, Callable, Optional, Union
 
 import torch
 from transformers import GenerationConfig, PreTrainedTokenizer, PreTrainedTokenizerFast
@@ -26,7 +26,7 @@ def __init__(
         self,
         model: PreTrainedModelWrapper,
         tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
-        queries_to_scores: Callable[[List[str]], List[float]],
+        queries_to_scores: Callable[[list[str]], list[float]],
         length_sampler: Any,
         sample_size: int = 4,
         seed: Optional[int] = None,
@@ -41,7 +41,7 @@ def __init__(
                 The pretrained model to use for generation
             tokenizer (`PreTrainedTokenizer` or `PreTrainedTokenizerFast`):
                 Tokenizer associated with the pretrained model
-            queries_to_scores (`Callable[[List[str]], List[float]]`):
+            queries_to_scores (`Callable[[list[str]], list[float]]`):
                 Callable that takes a list of generated texts and returns the associated reward scores
             length_sampler (`Any`):
                 Sampler used to sample the length of the generated text
@@ -78,16 +78,16 @@ def __init__(
 
     def generate(
         self,
-        tokenized_query: Union[List[int], torch.Tensor, List[torch.Tensor], List[List[int]]],
+        tokenized_query: Union[list[int], torch.Tensor, list[torch.Tensor], list[list[int]]],
         skip_special_tokens: bool = True,
         device: Optional[Union[str, torch.device]] = None,
         **generation_kwargs,
-    ) -> List[List[str]]:
+    ) -> list[list[str]]:
         r"""
         Generate the best of n samples for input queries
 
         Args:
-            tokenized_query (`List[int]` or `torch.Tensor` or `List[torch.Tensor]` or `List[int]`):
+            tokenized_query (`list[int]` or `torch.Tensor` or `list[torch.Tensor]` or `list[list[int]]`):
                 represents either a single tokenized query (a single tensor or a list of integers) or a batch of tokenized queries (a list of tensors or a list of lists of integers)
             skip_special_tokens (`bool`):
                 Whether to remove the special tokens from the output
@@ -98,13 +98,13 @@ def generate(
                 This is used to override generation config
 
         Returns:
-            List[List[str]]: A list of lists of generated texts
+            list[list[str]]: A list of lists of generated texts
         """
         queries = None
 
         if isinstance(tokenized_query, torch.Tensor) and tokenized_query.ndim == 1:
             queries = tokenized_query.unsqueeze(0)
-        elif isinstance(tokenized_query, List):
+        elif isinstance(tokenized_query, list):
             element_type = type(tokenized_query[0])
             if element_type is int:
                 queries = torch.tensor(tokenized_query).unsqueeze(0)
diff --git a/trl/mergekit_utils.py b/trl/mergekit_utils.py
index ab223a9eeb..936c42626c 100644
--- a/trl/mergekit_utils.py
+++ b/trl/mergekit_utils.py
@@ -63,8 +63,8 @@ class MergeConfig:
         target_model_path (`Optional[str]`): Path to the target model.
         policy_model_weight (`float`): Weight for the policy model (for `linear` and `ties` methods).
         target_model_weight (`float`): Weight for the target model (for `linear` and `ties` methods).
-        policy_model_density (`List[float]`): Density parameters for the policy model (for `ties` and `dare_ties`).
-        target_model_density (`List[float]`): Density parameters for the target model (for `ties` and `dare_ties`).
+        policy_model_density (`list[float]`): Density parameters for the policy model (for `ties` and `dare_ties`).
+        target_model_density (`list[float]`): Density parameters for the target model (for `ties` and `dare_ties`).
         normalize (`Optional[float]`): Normalization factor for the TIES method.
         t_values (`Optional[float]`): Interpolation factor for the SLERP method.
         dtype (`str`): Data type to use for merging, e.g., `"float16"`.
diff --git a/trl/models/modeling_sd_base.py b/trl/models/modeling_sd_base.py
index 2fee8d6b8b..fbd4fe5b2d 100644
--- a/trl/models/modeling_sd_base.py
+++ b/trl/models/modeling_sd_base.py
@@ -17,7 +17,7 @@
 import random
 import warnings
 from dataclasses import dataclass
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, Callable, Optional, Union
 
 import numpy as np
 import torch
@@ -43,9 +43,9 @@ class DDPOPipelineOutput:
     Args:
         images (`torch.Tensor`):
             The generated images.
-        latents (`List[torch.Tensor]`):
+        latents (`list[torch.Tensor]`):
             The latents used to generate the images.
-        log_probs (`List[torch.Tensor]`):
+        log_probs (`list[torch.Tensor]`):
             The log probabilities of the latents.
 
     """
@@ -161,7 +161,7 @@ def _left_broadcast(input_tensor, shape):
     from left to right
         Args:
             input_tensor (`torch.FloatTensor`): is the tensor to broadcast
-            shape (`Tuple[int]`): is the shape to broadcast to
+            shape (`tuple[int, ...]`): is the shape to broadcast to
     """
     input_ndim = input_tensor.ndim
     if input_ndim > len(shape):
@@ -325,15 +325,15 @@ def scheduler_step(
 @torch.no_grad()
 def pipeline_step(
     self,
-    prompt: Optional[Union[str, List[str]]] = None,
+    prompt: Optional[Union[str, list[str]]] = None,
     height: Optional[int] = None,
     width: Optional[int] = None,
     num_inference_steps: int = 50,
     guidance_scale: float = 7.5,
-    negative_prompt: Optional[Union[str, List[str]]] = None,
+    negative_prompt: Optional[Union[str, list[str]]] = None,
     num_images_per_prompt: Optional[int] = 1,
     eta: float = 0.0,
-    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+    generator: Optional[Union[torch.Generator, list[torch.Generator]]] = None,
     latents: Optional[torch.FloatTensor] = None,
     prompt_embeds: Optional[torch.FloatTensor] = None,
     negative_prompt_embeds: Optional[torch.FloatTensor] = None,
@@ -341,11 +341,11 @@ def pipeline_step(
     return_dict: bool = True,
     callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
     callback_steps: int = 1,
-    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+    cross_attention_kwargs: Optional[dict[str, Any]] = None,
     guidance_rescale: float = 0.0,
 ):
     r"""
-    Function invoked when calling the pipeline for generation.  Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.  instead.  height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image.
+    Function invoked when calling the pipeline for generation.  Args: prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds` instead.  height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image.
         width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
             The width in pixels of the generated image.
         num_inference_steps (`int`, *optional*, defaults to 50):
@@ -357,7 +357,7 @@ def pipeline_step(
             Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting `guidance_scale >
             1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
             usually at the expense of lower image quality.
-        negative_prompt (`str` or `List[str]`, *optional*):
+        negative_prompt (`str` or `list[str]`, *optional*):
             The prompt or prompts not to guide the image generation. If not defined, one has to pass
             `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
             less than `1`).
@@ -366,7 +366,7 @@ def pipeline_step(
         eta (`float`, *optional*, defaults to 0.0):
             Corresponds to parameter eta (η) in the DDIM paper: https://huggingface.co/papers/2010.02502. Only applies to
             [`schedulers.DDIMScheduler`], will be ignored for others.
-        generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+        generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
             One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
             to make generation deterministic.
         latents (`torch.FloatTensor`, *optional*):
@@ -531,7 +531,7 @@ def pipeline_step(
 
 def pipeline_step_with_grad(
     pipeline,
-    prompt: Optional[Union[str, List[str]]] = None,
+    prompt: Optional[Union[str, list[str]]] = None,
     height: Optional[int] = None,
     width: Optional[int] = None,
     num_inference_steps: int = 50,
@@ -541,10 +541,10 @@ def pipeline_step_with_grad(
     gradient_checkpoint: bool = True,
     truncated_backprop_timestep: int = 49,
     truncated_rand_backprop_minmax: tuple = (0, 50),
-    negative_prompt: Optional[Union[str, List[str]]] = None,
+    negative_prompt: Optional[Union[str, list[str]]] = None,
     num_images_per_prompt: Optional[int] = 1,
     eta: float = 0.0,
-    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+    generator: Optional[Union[torch.Generator, list[torch.Generator]]] = None,
     latents: Optional[torch.FloatTensor] = None,
     prompt_embeds: Optional[torch.FloatTensor] = None,
     negative_prompt_embeds: Optional[torch.FloatTensor] = None,
@@ -552,14 +552,14 @@ def pipeline_step_with_grad(
     return_dict: bool = True,
     callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
     callback_steps: int = 1,
-    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+    cross_attention_kwargs: Optional[dict[str, Any]] = None,
     guidance_rescale: float = 0.0,
 ):
     r"""
     Function to get RGB image with gradients attached to the model weights.
 
     Args:
-        prompt (`str` or `List[str]`, *optional*, defaults to `None`):
+        prompt (`str` or `list[str]`, *optional*, defaults to `None`):
             The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds` instead.
         height (`int`, *optional*, defaults to `pipeline.unet.config.sample_size * pipeline.vae_scale_factor`):
             The height in pixels of the generated image.
@@ -588,7 +588,7 @@ def pipeline_step_with_grad(
         truncated_rand_backprop_minmax (`Tuple`, *optional*, defaults to (0,50)):
             Range for randomized backprop. Here the value at 0 index indicates the earlier diffusion timestep to update (closer to noise), while the value
             at index 1 indicates the later diffusion timestep to update.
-        negative_prompt (`str` or `List[str]`, *optional*):
+        negative_prompt (`str` or `list[str]`, *optional*):
             The prompt or prompts not to guide the image generation. If not defined, one has to pass
             `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
             less than `1`).
@@ -597,7 +597,7 @@ def pipeline_step_with_grad(
         eta (`float`, *optional*, defaults to 0.0):
             Corresponds to parameter eta (η) in the DDIM paper: https://huggingface.co/papers/2010.02502. Only applies to
             [`schedulers.DDIMScheduler`], will be ignored for others.
-        generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+        generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
             One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
             to make generation deterministic.
         latents (`torch.FloatTensor`, *optional*):
diff --git a/trl/models/utils.py b/trl/models/utils.py
index 562b8617ed..0ff52461e0 100644
--- a/trl/models/utils.py
+++ b/trl/models/utils.py
@@ -15,7 +15,7 @@
 import itertools
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Literal, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Literal, Optional, Union
 
 from accelerate.utils import is_deepspeed_available
 from transformers import PreTrainedModel, PreTrainedTokenizer
@@ -80,7 +80,7 @@ def setup_chat_format(
     tokenizer: PreTrainedTokenizer,
     format: Optional[Literal["chatml"]] = "chatml",
     resize_to_multiple_of: Optional[int] = None,
-) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
+) -> tuple[PreTrainedModel, PreTrainedTokenizer]:
     """
     Setup chat format by adding special tokens to the tokenizer, setting the correct format, and extending the embedding layer of the model based on the new special tokens.
 
diff --git a/trl/trainer/alignprop_config.py b/trl/trainer/alignprop_config.py
index 899958a2fe..5817c45fe9 100644
--- a/trl/trainer/alignprop_config.py
+++ b/trl/trainer/alignprop_config.py
@@ -16,7 +16,7 @@
 import sys
 import warnings
 from dataclasses import dataclass, field
-from typing import Any, Dict, Literal, Optional, Tuple
+from typing import Any, Literal, Optional
 
 from transformers import is_bitsandbytes_available, is_torchvision_available
 
@@ -42,11 +42,11 @@ class AlignPropConfig:
             [tracking](https://huggingface.co/docs/accelerate/usage_guides/tracking) for more details.
         log_image_freq (`int`, *optional*, defaults to `1`):
             Frequency for logging images.
-        tracker_kwargs (`Dict[str, Any]`, *optional*, defaults to `{}`):
+        tracker_kwargs (`dict[str, Any]`, *optional*, defaults to `{}`):
             Keyword arguments for the tracker (e.g., `wandb_project`).
-        accelerator_kwargs (`Dict[str, Any]`, *optional*, defaults to `{}`):
+        accelerator_kwargs (`dict[str, Any]`, *optional*, defaults to `{}`):
             Keyword arguments for the accelerator.
-        project_kwargs (`Dict[str, Any]`, *optional*, defaults to `{}`):
+        project_kwargs (`dict[str, Any]`, *optional*, defaults to `{}`):
             Keyword arguments for the accelerator project config (e.g., `logging_dir`).
         tracker_project_name (`str`, *optional*, defaults to `"trl"`):
             Name of project to use for tracking.
@@ -92,7 +92,7 @@ class AlignPropConfig:
             If `True`, randomized truncation to different diffusion timesteps is used.
         truncated_backprop_timestep (`int`, *optional*, defaults to `49`):
             Absolute timestep to which the gradients are backpropagated. Used only if `truncated_backprop_rand=False`.
-        truncated_rand_backprop_minmax (`Tuple[int, int]`, *optional*, defaults to `(0, 50)`):
+        truncated_rand_backprop_minmax (`tuple[int, int]`, *optional*, defaults to `(0, 50)`):
             Range of diffusion timesteps for randomized truncated backpropagation.
         push_to_hub (`bool`, *optional*, defaults to `False`):
             Whether to push the final model to the Hub.
@@ -103,9 +103,9 @@ class AlignPropConfig:
     seed: int = 0
     log_with: Optional[Literal["wandb", "tensorboard"]] = None
     log_image_freq: int = 1
-    tracker_kwargs: Dict[str, Any] = field(default_factory=dict)
-    accelerator_kwargs: Dict[str, Any] = field(default_factory=dict)
-    project_kwargs: Dict[str, Any] = field(default_factory=dict)
+    tracker_kwargs: dict[str, Any] = field(default_factory=dict)
+    accelerator_kwargs: dict[str, Any] = field(default_factory=dict)
+    project_kwargs: dict[str, Any] = field(default_factory=dict)
     tracker_project_name: str = "trl"
     logdir: str = "logs"
     num_epochs: int = 100
@@ -129,7 +129,7 @@ class AlignPropConfig:
     negative_prompts: Optional[str] = None
     truncated_backprop_rand: bool = True
     truncated_backprop_timestep: int = 49
-    truncated_rand_backprop_minmax: Tuple[int, int] = (0, 50)
+    truncated_rand_backprop_minmax: tuple[int, int] = (0, 50)
     push_to_hub: bool = False
 
     def to_dict(self):
diff --git a/trl/trainer/alignprop_trainer.py b/trl/trainer/alignprop_trainer.py
index 84776a026b..0fccba6a53 100644
--- a/trl/trainer/alignprop_trainer.py
+++ b/trl/trainer/alignprop_trainer.py
@@ -14,7 +14,7 @@
 import os
 import textwrap
 from collections import defaultdict
-from typing import Any, Callable, List, Optional, Tuple, Union
+from typing import Any, Callable, Optional, Union
 from warnings import warn
 
 import torch
@@ -43,9 +43,9 @@ class AlignPropTrainer(BaseTrainer):
     Attributes:
         config (`AlignPropConfig`):
             Configuration object for AlignPropTrainer. Check the documentation of `PPOConfig` for more details.
-        reward_function (`Callable[[torch.Tensor, Tuple[str], Tuple[Any]], torch.Tensor]`):
+        reward_function (`Callable[[torch.Tensor, tuple[str], tuple[Any]], torch.Tensor]`):
             Reward function to be used
-        prompt_function (`Callable[[], Tuple[str, Any]]`):
+        prompt_function (`Callable[[], tuple[str, Any]]`):
             Function to generate prompts to guide model
         sd_pipeline (`DDPOStableDiffusionPipeline`):
             Stable Diffusion pipeline to be used for training.
@@ -58,8 +58,8 @@ class AlignPropTrainer(BaseTrainer):
     def __init__(
         self,
         config: AlignPropConfig,
-        reward_function: Callable[[torch.Tensor, Tuple[str], Tuple[Any]], torch.Tensor],
-        prompt_function: Callable[[], Tuple[str, Any]],
+        reward_function: Callable[[torch.Tensor, tuple[str], tuple[Any]], torch.Tensor],
+        prompt_function: Callable[[], tuple[str, Any]],
         sd_pipeline: DDPOStableDiffusionPipeline,
         image_samples_hook: Optional[Callable[[Any, Any, Any], Any]] = None,
     ):
@@ -325,7 +325,7 @@ def _generate_samples(self, batch_size, with_grad=True, prompts=None):
             with_grad (bool): Whether the generated RGBs should have gradients attached to it.
 
         Returns:
-            prompt_image_pairs (Dict[Any])
+            prompt_image_pairs (dict[Any])
         """
         prompt_image_pairs = {}
 
@@ -394,7 +394,7 @@ def create_model_card(
         self,
         model_name: Optional[str] = None,
         dataset_name: Optional[str] = None,
-        tags: Union[str, List[str], None] = None,
+        tags: Union[str, list[str], None] = None,
     ):
         """
         Creates a draft of a model card using the information available to the `Trainer`.
@@ -404,7 +404,7 @@ def create_model_card(
                 The name of the model.
             dataset_name (`str`, *optional*, defaults to `None`):
                 The name of the dataset used for training.
-            tags (`str`, `List[str]` or `None`, *optional*, defaults to `None`):
+            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                 Tags to be associated with the model card.
         """
         if not self.is_world_process_zero():
diff --git a/trl/trainer/bco_config.py b/trl/trainer/bco_config.py
index 2d7a3f7a09..61729576ec 100644
--- a/trl/trainer/bco_config.py
+++ b/trl/trainer/bco_config.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass
-from typing import Any, Dict, Optional
+from typing import Any, Optional
 
 from transformers import TrainingArguments
 
@@ -54,10 +54,10 @@ class BCOConfig(TrainingArguments):
         precompute_ref_log_probs (`bool`, *optional*, defaults to `False`):
             Whether to precompute reference model log probabilities for training and evaluation datasets. This is
             useful when training without the reference model to reduce the total GPU memory needed.
-        model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`):
+        model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`):
             Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
             string.
-        ref_model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`):
+        ref_model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`):
             Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the reference model
             from a string.
         dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`):
@@ -80,8 +80,8 @@ class BCOConfig(TrainingArguments):
     generate_during_eval: bool = False
     is_encoder_decoder: Optional[bool] = None
     precompute_ref_log_probs: bool = False
-    model_init_kwargs: Optional[Dict[str, Any]] = None
-    ref_model_init_kwargs: Optional[Dict[str, Any]] = None
+    model_init_kwargs: Optional[dict[str, Any]] = None
+    ref_model_init_kwargs: Optional[dict[str, Any]] = None
     dataset_num_proc: Optional[int] = None
     prompt_sample_size: int = 1024
     min_density_ratio: float = 0.5
diff --git a/trl/trainer/bco_trainer.py b/trl/trainer/bco_trainer.py
index 287cce8436..6b7a8c4a8d 100644
--- a/trl/trainer/bco_trainer.py
+++ b/trl/trainer/bco_trainer.py
@@ -21,7 +21,7 @@
 from contextlib import contextmanager, nullcontext
 from copy import deepcopy
 from operator import itemgetter
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union
 
 import numpy as np
 import torch
@@ -84,10 +84,10 @@
 
 
 def _tokenize(
-    batch: Dict[str, List[Any]],
+    batch: dict[str, list[Any]],
     tokenizer: "PreTrainedTokenizer",
     embedding_tokenizer: Optional["PreTrainedTokenizer"] = None,
-) -> Dict[str, List[Any]]:
+) -> dict[str, list[Any]]:
     """Tokenize a batch from a BCO specific dataset."""
     prompt_tokenized = tokenizer(batch["prompt"], add_special_tokens=False)
     prompt_input_ids = prompt_tokenized["input_ids"]
@@ -152,7 +152,7 @@ def _tokenize(
     return output
 
 
-def _process_tokens(example: Dict[str, Any], model: "PreTrainedModel" = None, **kwargs) -> Dict:
+def _process_tokens(example: dict[str, Any], model: "PreTrainedModel" = None, **kwargs) -> dict:
     """Process tokens of a BCO specific dataset.
 
     At this stage, we don't convert to PyTorch tensors yet; we just handle the truncation
@@ -300,17 +300,17 @@ class BCOTrainer(Trainer):
             which will pad the sequences to the maximum length of the sequences in the batch, given a dataset of paired sequences.
         model_init (`Callable[[], transformers.PreTrainedModel]`):
             The model initializer to use for training. If None is specified, the default model initializer will be used.
-        callbacks (`List[transformers.TrainerCallback]`):
+        callbacks (`list[transformers.TrainerCallback]`):
             The callbacks to use for training.
-        optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
             The optimizer and scheduler to use for training.
         preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
             The function to use to preprocess the logits before computing the metrics.
-        peft_config (`Dict`, defaults to `None`):
+        peft_config (`dict`, defaults to `None`):
             The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in a PEFT model.
         disable_dropout (`bool`, defaults to `True`):
             Whether or not to disable dropouts in `model` and `ref_model`.
-        compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*):
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
             The function to use to compute the metrics. Must take a `EvalPrediction` and return
             a dictionary string to metric values.
         model_adapter_name (`str`, defaults to `None`):
@@ -327,17 +327,17 @@ def __init__(
         ref_model: Optional[Union[PreTrainedModel, nn.Module, str]] = None,
         args: BCOConfig = None,
         train_dataset: Optional[Dataset] = None,
-        eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
+        eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None,
         processing_class: Optional[
             Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin]
         ] = None,
         data_collator: Optional[DataCollator] = None,
         model_init: Optional[Callable[[], PreTrainedModel]] = None,
-        callbacks: Optional[List[TrainerCallback]] = None,
-        optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
+        callbacks: Optional[list[TrainerCallback]] = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
         preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
-        peft_config: Optional[Dict] = None,
-        compute_metrics: Optional[Callable[[EvalLoopOutput], Dict]] = None,
+        peft_config: Optional[dict] = None,
+        compute_metrics: Optional[Callable[[EvalLoopOutput], dict]] = None,
         model_adapter_name: Optional[str] = None,
         ref_adapter_name: Optional[str] = None,
         embedding_func: Optional[Callable] = None,
@@ -782,8 +782,8 @@ def _vectorize_prompt(self, input_ids: torch.LongTensor, attention_mask: torch.L
         return embeddings
 
     def _get_prompt_embeddings(
-        self, batch: Dict[str, Union[List, torch.LongTensor]]
-    ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
+        self, batch: dict[str, Union[list, torch.LongTensor]]
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor]:
         """Extract embeddings from frozen embedding model"""
 
         if not self.match_underlying_distribution:
@@ -988,7 +988,7 @@ def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoa
 
         return super().get_eval_dataloader(eval_dataset=eval_dataset)
 
-    def compute_reference_log_probs(self, padded_batch: Dict) -> Dict:
+    def compute_reference_log_probs(self, padded_batch: dict) -> dict:
         """Computes log probabilities of the reference model for a single padded batch of a BCO specific dataset."""
         with torch.no_grad():
             if self.ref_model is None:
@@ -1072,8 +1072,8 @@ def get_batch_logps(
             return (per_token_logps * loss_mask).sum(-1)
 
     def forward(
-        self, model: nn.Module, batch: Dict[str, Union[List, torch.LongTensor]]
-    ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
+        self, model: nn.Module, batch: dict[str, Union[list, torch.LongTensor]]
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
         model_kwargs = (
             {
                 "labels": batch["completion_labels"],
@@ -1137,7 +1137,7 @@ def bco_loss(
         reference_rejected_logps: torch.FloatTensor,
         chosen_embeddings: Optional[torch.FloatTensor],
         rejected_embeddings: Optional[torch.FloatTensor],
-    ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
         """Compute the BCO loss for a batch of policy and reference model log probabilities.
 
         Args:
@@ -1194,7 +1194,7 @@ def bco_loss(
     def get_batch_loss_metrics(
         self,
         model,
-        batch: Dict[str, Union[List, torch.LongTensor]],
+        batch: dict[str, Union[list, torch.LongTensor]],
     ):
         """Compute the BCO loss and other metrics for the given batch of inputs for train or test."""
         metrics = {}
@@ -1274,10 +1274,10 @@ def get_batch_loss_metrics(
     def compute_loss(
         self,
         model: Union[PreTrainedModel, nn.Module],
-        inputs: Dict[str, Union[torch.Tensor, Any]],
+        inputs: dict[str, Union[torch.Tensor, Any]],
         return_outputs=False,
         num_items_in_batch=None,
-    ) -> Union[torch.Tensor, Tuple[torch.Tensor, Dict[str, torch.Tensor]]]:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, dict[str, torch.Tensor]]]:
         if not self.use_dpo_data_collator:
             warnings.warn(
                 "compute_loss is only implemented for DPODataCollatorWithPadding, and you passed a datacollator that is different than "
@@ -1298,7 +1298,7 @@ def compute_loss(
             return (loss, metrics)
         return loss
 
-    def store_metrics(self, metrics: Dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None:
+    def store_metrics(self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None:
         for key, value in metrics.items():
             self._stored_metrics[train_eval][key].append(value)
 
@@ -1307,7 +1307,7 @@ def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
             return None
         return SequentialSampler(self.train_dataset)
 
-    def generate_from_model_and_ref(self, model, batch: Dict[str, torch.LongTensor]) -> Tuple[str, str]:
+    def generate_from_model_and_ref(self, model, batch: dict[str, torch.LongTensor]) -> tuple[str, str]:
         """Generate samples from the model and reference model for the given batch of inputs."""
 
         # If one uses `generate_during_eval` with peft + bf16, we need to explicitly call generate with
@@ -1355,9 +1355,9 @@ def generate_from_model_and_ref(self, model, batch: Dict[str, torch.LongTensor])
     def prediction_step(
         self,
         model: Union[PreTrainedModel, nn.Module],
-        inputs: Dict[str, Union[torch.Tensor, Any]],
+        inputs: dict[str, Union[torch.Tensor, Any]],
         prediction_loss_only: bool,
-        ignore_keys: Optional[List[str]] = None,
+        ignore_keys: Optional[list[str]] = None,
     ):
         if not self.use_dpo_data_collator:
             warnings.warn(
@@ -1397,7 +1397,7 @@ def evaluation_loop(
         dataloader: DataLoader,
         description: str,
         prediction_loss_only: Optional[bool] = None,
-        ignore_keys: Optional[List[str]] = None,
+        ignore_keys: Optional[list[str]] = None,
         metric_key_prefix: str = "eval",
     ) -> EvalLoopOutput:
         """
@@ -1448,12 +1448,12 @@ def evaluation_loop(
 
         return initial_output
 
-    def log(self, logs: Dict[str, float], start_time: Optional[float] = None) -> None:
+    def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None:
         """
         Log `logs` on the various objects watching training, including stored metrics.
 
         Args:
-            logs (`Dict[str, float]`):
+            logs (`dict[str, float]`):
                 The values to log.
             start_time (`float` or `None`, *optional*, defaults to `None`):
                 Start time of the training.
@@ -1491,7 +1491,7 @@ def create_model_card(
         self,
         model_name: Optional[str] = None,
         dataset_name: Optional[str] = None,
-        tags: Union[str, List[str], None] = None,
+        tags: Union[str, list[str], None] = None,
     ):
         """
         Creates a draft of a model card using the information available to the `Trainer`.
@@ -1501,7 +1501,7 @@ def create_model_card(
                 The name of the model.
             dataset_name (`str`, *optional*, defaults to `None`):
                 The name of the dataset used for training.
-            tags (`str`, `List[str]` or None, *optional*, defaults to `None`):
+            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                 Tags to be associated with the model card.
         """
         if not self.is_world_process_zero():
diff --git a/trl/trainer/callbacks.py b/trl/trainer/callbacks.py
index 7183881715..dbee1f60a2 100644
--- a/trl/trainer/callbacks.py
+++ b/trl/trainer/callbacks.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
-from typing import List, Optional, Union
+from typing import Optional, Union
 
 import pandas as pd
 import torch
@@ -48,18 +48,18 @@
 
 
 def _generate_completions(
-    prompts: List[str],
+    prompts: list[str],
     model: PreTrainedModel,
     tokenizer: PreTrainedTokenizerBase,
     accelerator: Accelerator,
     generation_config: Optional[GenerationConfig],
     batch_size: int = 1,
-) -> List[str]:
+) -> list[str]:
     """
     Generates completions for a list of pre-formatted prompts from the given model.
 
     Args:
-        prompts (List[str]): A list of input prompts for which completions are to be generated.
+        prompts (list[str]): A list of input prompts for which completions are to be generated.
         model (PreTrainedModel): The pre-trained model to be used for generation.
         tokenizer (PreTrainedTokenizerBase): The tokenizer to be used for encoding and decoding.
         accelerator (Accelerator): The accelerator to be used for model execution.
@@ -67,7 +67,7 @@ def _generate_completions(
         batch_size (int, optional): The number of prompts to process in each batch. Default is 1.
 
     Returns:
-        List[str]: A list of generated text completions corresponding to the input prompts.
+        list[str]: A list of generated text completions corresponding to the input prompts.
     """
     completions = []
     with unwrap_model_for_generation(model, accelerator) as unwrapped_model:
diff --git a/trl/trainer/cpo_config.py b/trl/trainer/cpo_config.py
index ac45b203e5..91d3008533 100644
--- a/trl/trainer/cpo_config.py
+++ b/trl/trainer/cpo_config.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass
-from typing import Any, Dict, Literal, Optional
+from typing import Any, Literal, Optional
 
 from transformers import TrainingArguments
 
@@ -70,7 +70,7 @@ class CPOConfig(TrainingArguments):
         is_encoder_decoder (`Optional[bool]`, *optional*, defaults to `None`):
             When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument,
             you need to specify if the model returned by the callable is an encoder-decoder model.
-        model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`):
+        model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`):
             Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
             string.
         dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`):
@@ -92,5 +92,5 @@ class CPOConfig(TrainingArguments):
     truncation_mode: str = "keep_end"
     generate_during_eval: bool = False
     is_encoder_decoder: Optional[bool] = None
-    model_init_kwargs: Optional[Dict[str, Any]] = None
+    model_init_kwargs: Optional[dict[str, Any]] = None
     dataset_num_proc: Optional[int] = None
diff --git a/trl/trainer/cpo_trainer.py b/trl/trainer/cpo_trainer.py
index ccf1fcc5b1..a1153dec5d 100644
--- a/trl/trainer/cpo_trainer.py
+++ b/trl/trainer/cpo_trainer.py
@@ -20,7 +20,7 @@
 import warnings
 from collections import defaultdict
 from contextlib import nullcontext
-from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union
+from typing import Any, Callable, Literal, Optional, Union
 
 import numpy as np
 import torch
@@ -91,15 +91,15 @@ class CPOTrainer(Trainer):
             reuse the fine-tuned model.
         model_init (`Callable[[], transformers.PreTrainedModel]`):
             The model initializer to use for training. If None is specified, the default model initializer will be used.
-        callbacks (`List[transformers.TrainerCallback]`):
+        callbacks (`list[transformers.TrainerCallback]`):
             The callbacks to use for training.
-        optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
             The optimizer and scheduler to use for training.
         preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
             The function to use to preprocess the logits before computing the metrics.
-        peft_config (`Dict`, defaults to `None`):
+        peft_config (`dict`, defaults to `None`):
             The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in a PEFT model.
-        compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*):
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
             The function to use to compute the metrics. Must take a `EvalPrediction` and return
             a dictionary string to metric values.
     """
@@ -115,16 +115,16 @@ def __init__(
         args: Optional[CPOConfig] = None,
         data_collator: Optional[DataCollator] = None,
         train_dataset: Optional[Dataset] = None,
-        eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
+        eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None,
         processing_class: Optional[
             Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin]
         ] = None,
         model_init: Optional[Callable[[], PreTrainedModel]] = None,
-        callbacks: Optional[List[TrainerCallback]] = None,
-        optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
+        callbacks: Optional[list[TrainerCallback]] = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
         preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
-        peft_config: Optional[Dict] = None,
-        compute_metrics: Optional[Callable[[EvalLoopOutput], Dict]] = None,
+        peft_config: Optional[dict] = None,
+        compute_metrics: Optional[Callable[[EvalLoopOutput], dict]] = None,
     ):
         if args.model_init_kwargs is None:
             model_init_kwargs = {}
@@ -422,7 +422,7 @@ def build_tokenized_answer(self, prompt, answer):
             attention_mask=answer_attention_mask,
         )
 
-    def tokenize_row(self, feature, model: Optional[Union[PreTrainedModel, nn.Module]] = None) -> Dict:
+    def tokenize_row(self, feature, model: Optional[Union[PreTrainedModel, nn.Module]] = None) -> dict:
         """Tokenize a single row from a CPO specific dataset.
 
         At this stage, we don't convert to PyTorch tensors yet; we just handle the truncation
@@ -570,12 +570,12 @@ def tokenize_row(self, feature, model: Optional[Union[PreTrainedModel, nn.Module
 
     @staticmethod
     def concatenated_inputs(
-        batch: Dict[str, Union[List, torch.LongTensor]],
+        batch: dict[str, Union[list, torch.LongTensor]],
         is_encoder_decoder: bool = False,
         label_pad_token_id: int = -100,
         padding_value: int = 0,
         device: Optional[torch.device] = None,
-    ) -> Dict[str, torch.LongTensor]:
+    ) -> dict[str, torch.LongTensor]:
         """Concatenate the chosen and rejected inputs into a single tensor.
 
         Args:
@@ -634,7 +634,7 @@ def cpo_loss(
         self,
         policy_chosen_logps: torch.FloatTensor,
         policy_rejected_logps: torch.FloatTensor,
-    ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
         """Compute the CPO loss for a batch of policy and reference model log probabilities.
 
         Args:
@@ -720,8 +720,8 @@ def get_batch_logps(
             return (per_token_logps * loss_mask).sum(-1)
 
     def concatenated_forward(
-        self, model: nn.Module, batch: Dict[str, Union[List, torch.LongTensor]]
-    ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
+        self, model: nn.Module, batch: dict[str, Union[list, torch.LongTensor]]
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
         """Run the given model on the given batch of inputs, concatenating the chosen and rejected inputs together.
 
         We do this to avoid doing two forward passes, because it's faster for FSDP.
@@ -797,7 +797,7 @@ def cross_entropy_loss(logits, labels):
     def get_batch_loss_metrics(
         self,
         model,
-        batch: Dict[str, Union[List, torch.LongTensor]],
+        batch: dict[str, Union[list, torch.LongTensor]],
         train_eval: Literal["train", "eval"] = "train",
     ):
         """Compute the CPO loss and other metrics for the given batch of inputs for train or test."""
@@ -841,10 +841,10 @@ def get_batch_loss_metrics(
     def compute_loss(
         self,
         model: Union[PreTrainedModel, nn.Module],
-        inputs: Dict[str, Union[torch.Tensor, Any]],
+        inputs: dict[str, Union[torch.Tensor, Any]],
         return_outputs=False,
         num_items_in_batch=None,
-    ) -> Union[torch.Tensor, Tuple[torch.Tensor, Dict[str, torch.Tensor]]]:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, dict[str, torch.Tensor]]]:
         if not self.use_dpo_data_collator:
             warnings.warn(
                 "compute_loss is only implemented for DPODataCollatorWithPadding, and you passed a datacollator that is different than "
@@ -863,7 +863,7 @@ def compute_loss(
             return (loss, metrics)
         return loss
 
-    def generate_from_model(self, model, batch: Dict[str, torch.LongTensor]) -> str:
+    def generate_from_model(self, model, batch: dict[str, torch.LongTensor]) -> str:
         """Generate samples from the model and reference model for the given batch of inputs."""
 
         # If one uses `generate_during_eval` with peft + bf16, we need to explicitly call generate with
@@ -887,9 +887,9 @@ def generate_from_model(self, model, batch: Dict[str, torch.LongTensor]) -> str:
     def prediction_step(
         self,
         model: Union[PreTrainedModel, nn.Module],
-        inputs: Dict[str, Union[torch.Tensor, Any]],
+        inputs: dict[str, Union[torch.Tensor, Any]],
         prediction_loss_only: bool,
-        ignore_keys: Optional[List[str]] = None,
+        ignore_keys: Optional[list[str]] = None,
     ):
         if not self.use_dpo_data_collator:
             warnings.warn(
@@ -924,7 +924,7 @@ def prediction_step(
 
         return (loss.detach(), logits, labels)
 
-    def store_metrics(self, metrics: Dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None:
+    def store_metrics(self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None:
         for key, value in metrics.items():
             self._stored_metrics[train_eval][key].append(value)
 
@@ -933,7 +933,7 @@ def evaluation_loop(
         dataloader: DataLoader,
         description: str,
         prediction_loss_only: Optional[bool] = None,
-        ignore_keys: Optional[List[str]] = None,
+        ignore_keys: Optional[list[str]] = None,
         metric_key_prefix: str = "eval",
     ) -> EvalLoopOutput:
         """
@@ -976,12 +976,12 @@ def evaluation_loop(
 
         return initial_output
 
-    def log(self, logs: Dict[str, float], start_time: Optional[float] = None) -> None:
+    def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None:
         """
         Log `logs` on the various objects watching training, including stored metrics.
 
         Args:
-            logs (`Dict[str, float]`):
+            logs (`dict[str, float]`):
                 The values to log.
             start_time (`float` or `None`, *optional*, defaults to `None`):
                 Start time of the training.
@@ -1025,7 +1025,7 @@ def create_model_card(
         self,
         model_name: Optional[str] = None,
         dataset_name: Optional[str] = None,
-        tags: Union[str, List[str], None] = None,
+        tags: Union[str, list[str], None] = None,
     ):
         """
         Creates a draft of a model card using the information available to the `Trainer`.
@@ -1035,7 +1035,7 @@ def create_model_card(
                 The name of the model.
             dataset_name (`str`, *optional*, defaults to `None`):
                 The name of the dataset used for training.
-            tags (`str`, `List[str]` or `None`, *optional*, defaults to `None`):
+            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                 Tags to be associated with the model card.
         """
         if not self.is_world_process_zero():
diff --git a/trl/trainer/ddpo_trainer.py b/trl/trainer/ddpo_trainer.py
index 412a461a30..4ef0c82c82 100644
--- a/trl/trainer/ddpo_trainer.py
+++ b/trl/trainer/ddpo_trainer.py
@@ -16,7 +16,7 @@
 import textwrap
 from collections import defaultdict
 from concurrent import futures
-from typing import Any, Callable, List, Optional, Tuple, Union
+from typing import Any, Callable, Optional, Union
 from warnings import warn
 
 import torch
@@ -46,8 +46,8 @@ class DDPOTrainer(BaseTrainer):
     Attributes:
         **config** (`DDPOConfig`) -- Configuration object for DDPOTrainer. Check the documentation of `PPOConfig` for more
          details.
-        **reward_function** (Callable[[torch.Tensor, Tuple[str], Tuple[Any]], torch.Tensor]) -- Reward function to be used
-        **prompt_function** (Callable[[], Tuple[str, Any]]) -- Function to generate prompts to guide model
+        **reward_function** (Callable[[torch.Tensor, tuple[str], tuple[Any]], torch.Tensor]) -- Reward function to be used
+        **prompt_function** (Callable[[], tuple[str, Any]]) -- Function to generate prompts to guide model
         **sd_pipeline** (`DDPOStableDiffusionPipeline`) -- Stable Diffusion pipeline to be used for training.
         **image_samples_hook** (Optional[Callable[[Any, Any, Any], Any]]) -- Hook to be called to log images
     """
@@ -57,8 +57,8 @@ class DDPOTrainer(BaseTrainer):
     def __init__(
         self,
         config: DDPOConfig,
-        reward_function: Callable[[torch.Tensor, Tuple[str], Tuple[Any]], torch.Tensor],
-        prompt_function: Callable[[], Tuple[str, Any]],
+        reward_function: Callable[[torch.Tensor, tuple[str], tuple[Any]], torch.Tensor],
+        prompt_function: Callable[[], tuple[str, Any]],
         sd_pipeline: DDPOStableDiffusionPipeline,
         image_samples_hook: Optional[Callable[[Any, Any, Any], Any]] = None,
     ):
@@ -437,7 +437,7 @@ def _generate_samples(self, iterations, batch_size):
             batch_size (int): Batch size to use for sampling
 
         Returns:
-            samples (List[Dict[str, torch.Tensor]]), prompt_image_pairs (List[List[Any]])
+            samples (list[dict[str, torch.Tensor]]), prompt_image_pairs (list[list[Any]])
         """
         samples = []
         prompt_image_pairs = []
@@ -498,7 +498,7 @@ def _train_batched_samples(self, inner_epoch, epoch, global_step, batched_sample
             inner_epoch (int): The current inner epoch
             epoch (int): The current epoch
             global_step (int): The current global step
-            batched_samples (List[Dict[str, torch.Tensor]]): The batched samples to train on
+            batched_samples (list[dict[str, torch.Tensor]]): The batched samples to train on
 
         Side Effects:
             - Model weights are updated
@@ -551,7 +551,7 @@ def _train_batched_samples(self, inner_epoch, epoch, global_step, batched_sample
                     info = defaultdict(list)
         return global_step
 
-    def _config_check(self) -> Tuple[bool, str]:
+    def _config_check(self) -> tuple[bool, str]:
         samples_per_epoch = (
             self.config.sample_batch_size * self.accelerator.num_processes * self.config.sample_num_batches_per_epoch
         )
@@ -596,7 +596,7 @@ def create_model_card(
         self,
         model_name: Optional[str] = None,
         dataset_name: Optional[str] = None,
-        tags: Union[str, List[str], None] = None,
+        tags: Union[str, list[str], None] = None,
     ):
         """
         Creates a draft of a model card using the information available to the `Trainer`.
@@ -606,7 +606,7 @@ def create_model_card(
                 The name of the model.
             dataset_name (`str`, *optional*, defaults to `None`):
                 The name of the dataset used for training.
-            tags (`str`, `List[str]` or `None`, *optional*, defaults to `None`):
+            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                 Tags to be associated with the model card.
         """
         if not self.is_world_process_zero():
diff --git a/trl/trainer/dpo_config.py b/trl/trainer/dpo_config.py
index 964b29bbe7..dec8e93bc8 100644
--- a/trl/trainer/dpo_config.py
+++ b/trl/trainer/dpo_config.py
@@ -14,7 +14,7 @@
 import warnings
 from dataclasses import dataclass
 from enum import Enum
-from typing import Any, Dict, Literal, Optional
+from typing import Any, Literal, Optional
 
 from transformers import TrainingArguments
 
@@ -96,10 +96,10 @@ class DPOConfig(TrainingArguments):
             useful when training without the reference model to reduce the total GPU memory needed.
         dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`):
             Number of processes to use for processing the dataset.
-        model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`):
+        model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`):
             Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
             string.
-        ref_model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`):
+        ref_model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`):
             Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the reference model
             from a string.
         model_adapter_name (`Optional[str]`, *optional*, defaults to `None`):
@@ -174,8 +174,8 @@ class DPOConfig(TrainingArguments):
     generate_during_eval: bool = False
     precompute_ref_log_probs: bool = False
     dataset_num_proc: Optional[int] = None
-    model_init_kwargs: Optional[Dict[str, Any]] = None
-    ref_model_init_kwargs: Optional[Dict[str, Any]] = None
+    model_init_kwargs: Optional[dict[str, Any]] = None
+    ref_model_init_kwargs: Optional[dict[str, Any]] = None
     model_adapter_name: Optional[str] = None
     ref_adapter_name: Optional[str] = None
     reference_free: bool = False
diff --git a/trl/trainer/dpo_trainer.py b/trl/trainer/dpo_trainer.py
index 1856e1899a..4be0665456 100644
--- a/trl/trainer/dpo_trainer.py
+++ b/trl/trainer/dpo_trainer.py
@@ -22,7 +22,7 @@
 from contextlib import contextmanager, nullcontext
 from copy import deepcopy
 from dataclasses import dataclass
-from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union
+from typing import Any, Callable, Literal, Optional, Union
 
 import torch
 import torch.amp as amp
@@ -118,7 +118,7 @@ class PreferenceCollator(DataCollatorMixin):
     pad_token_id: int
     return_tensors: str = "pt"
 
-    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
+    def torch_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
         # Convert to tensor
         prompt_input_ids = [torch.tensor(example["prompt_input_ids"]) for example in examples]
         prompt_attention_mask = [torch.ones_like(input_ids) for input_ids in prompt_input_ids]
@@ -173,16 +173,16 @@ class DPOTrainer(Trainer):
             This supercedes the `tokenizer` argument, which is now deprecated.
         model_init (`Callable[[], transformers.PreTrainedModel]`):
             The model initializer to use for training. If None is specified, the default model initializer will be used.
-        compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*):
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
             The function to use to compute the metrics. Must take a `EvalPrediction` and return
             a dictionary string to metric values.
-        callbacks (`List[transformers.TrainerCallback]`):
+        callbacks (`list[transformers.TrainerCallback]`):
             The callbacks to use for training.
-        optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
             The optimizer and scheduler to use for training.
         preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
             The function to use to preprocess the logits before computing the metrics.
-        peft_config (`Dict`, defaults to `None`):
+        peft_config (`dict`, defaults to `None`):
             The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in a PEFT model.
     """
 
@@ -198,16 +198,16 @@ def __init__(
         args: Optional[DPOConfig] = None,
         data_collator: Optional[DataCollator] = None,
         train_dataset: Optional[Dataset] = None,
-        eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
+        eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None,
         processing_class: Optional[
             Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin]
         ] = None,
         model_init: Optional[Callable[[], PreTrainedModel]] = None,
-        compute_metrics: Optional[Callable[[EvalLoopOutput], Dict]] = None,
-        callbacks: Optional[List[TrainerCallback]] = None,
-        optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
+        compute_metrics: Optional[Callable[[EvalLoopOutput], dict]] = None,
+        callbacks: Optional[list[TrainerCallback]] = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
         preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
-        peft_config: Optional[Dict] = None,
+        peft_config: Optional[dict] = None,
     ):
         if not isinstance(model, str) and ref_model is model:
             raise ValueError(
@@ -553,7 +553,7 @@ def tokenize_row(features, processing_class, max_prompt_length, max_completion_l
         Tokenize a row of the dataset.
 
         Args:
-            features (`Dict[str, str]`):
+            features (`dict[str, str]`):
                 Row of the dataset, should contain the keys `"prompt"`, `"chosen"`, and `"rejected"`.
             processing_class (`PreTrainedTokenizerBase`):
                 Processing class used to process the data.
@@ -567,7 +567,7 @@ def tokenize_row(features, processing_class, max_prompt_length, max_completion_l
                 completion sequences will have an eos token appended.
 
         Returns:
-            `Dict[str, List[int]]`:
+            `dict[str, list[int]]`:
                 Tokenized sequences with the keys `"prompt_input_ids"`, `"chosen_input_ids"`, and
                 `"rejected_input_ids".
 
@@ -794,7 +794,7 @@ def null_ref_context(self):
             if self.ref_adapter_name:
                 self.model.set_adapter(self.model_adapter_name or "default")
 
-    def compute_ref_log_probs(self, batch: Dict[str, torch.LongTensor]) -> Dict:
+    def compute_ref_log_probs(self, batch: dict[str, torch.LongTensor]) -> dict:
         """Computes log probabilities of the reference model for a single padded batch of a DPO specific dataset."""
         compte_ref_context_manager = amp.autocast("cuda") if self._peft_has_been_casted_to_bf16 else nullcontext()
         with torch.no_grad(), compte_ref_context_manager:
@@ -807,14 +807,14 @@ def compute_ref_log_probs(self, batch: Dict[str, torch.LongTensor]) -> Dict:
 
     @staticmethod
     def concatenated_inputs(
-        batch: Dict[str, Union[List, torch.LongTensor]], padding_value: int
-    ) -> Dict[str, torch.LongTensor]:
+        batch: dict[str, Union[list, torch.LongTensor]], padding_value: int
+    ) -> dict[str, torch.LongTensor]:
         """
         Concatenate the `chosen` and `rejected` inputs from the batch into a single tensor for both the prompt
         and completion sequences.
 
         Args:
-            batch (`Dict[str, Union[List, torch.LongTensor]]`):
+            batch (`dict[str, Union[list, torch.LongTensor]]`):
                 A batch of input data. The batch must contain the following keys:
 
                 - `"prompt_input_ids"`: Tensor of shape `(batch_size, prompt_length)` representing the prompt input IDs.
@@ -828,7 +828,7 @@ def concatenated_inputs(
                 `rejected_input_ids`).
 
         Returns:
-            `Dict[str, torch.LongTensor]`: A dictionary containing:
+            `dict[str, torch.LongTensor]`: A dictionary containing:
 
                 - `"prompt_input_ids"`: Concatenated prompt input IDs of shape `(2 * batch_size, prompt_length)`.
                 - `"completion_input_ids"`: Concatenated chosen and rejected completion input IDs of shape `(2 * batch_size, max_completion_length)`.
@@ -879,7 +879,7 @@ def dpo_loss(
         rejected_logps: torch.FloatTensor,
         ref_chosen_logps: torch.FloatTensor,
         ref_rejected_logps: torch.FloatTensor,
-    ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
         """
         Compute the DPO loss for a batch of policy and reference model log probabilities.
 
@@ -1060,7 +1060,7 @@ def dpo_loss(
 
         return losses, chosen_rewards, rejected_rewards
 
-    def concatenated_forward(self, model: nn.Module, batch: Dict[str, Union[List, torch.LongTensor]]):
+    def concatenated_forward(self, model: nn.Module, batch: dict[str, Union[list, torch.LongTensor]]):
         """Run the given model on the given batch of inputs, concatenating the chosen and rejected inputs together.
 
         We do this to avoid doing two forward passes, because it's faster for FSDP.
@@ -1202,7 +1202,7 @@ def concatenated_forward(self, model: nn.Module, batch: Dict[str, Union[List, to
     def get_batch_loss_metrics(
         self,
         model,
-        batch: Dict[str, Union[List, torch.LongTensor]],
+        batch: dict[str, Union[list, torch.LongTensor]],
         train_eval: Literal["train", "eval"] = "train",
     ):
         """Compute the DPO loss and other metrics for the given batch of inputs for train or test."""
@@ -1250,10 +1250,10 @@ def get_batch_loss_metrics(
     def compute_loss(
         self,
         model: Union[PreTrainedModel, nn.Module],
-        inputs: Dict[str, Union[torch.Tensor, Any]],
+        inputs: dict[str, Union[torch.Tensor, Any]],
         return_outputs=False,
         num_items_in_batch=None,
-    ) -> Union[torch.Tensor, Tuple[torch.Tensor, Dict[str, torch.Tensor]]]:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, dict[str, torch.Tensor]]]:
         compute_loss_context_manager = amp.autocast("cuda") if self._peft_has_been_casted_to_bf16 else nullcontext()
         with compute_loss_context_manager:
             loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="train")
@@ -1268,7 +1268,7 @@ def compute_loss(
 
         return loss
 
-    def generate_from_model_and_ref(self, model, batch: Dict[str, torch.LongTensor]) -> Tuple[str, str]:
+    def generate_from_model_and_ref(self, model, batch: dict[str, torch.LongTensor]) -> tuple[str, str]:
         """Generate samples from the model and reference model for the given batch of inputs."""
 
         # If one uses `generate_during_eval` with peft + bf16, we need to explicitly call generate with
@@ -1317,9 +1317,9 @@ def generate_from_model_and_ref(self, model, batch: Dict[str, torch.LongTensor])
     def prediction_step(
         self,
         model: Union[PreTrainedModel, nn.Module],
-        inputs: Dict[str, Union[torch.Tensor, Any]],
+        inputs: dict[str, Union[torch.Tensor, Any]],
         prediction_loss_only: bool,
-        ignore_keys: Optional[List[str]] = None,
+        ignore_keys: Optional[list[str]] = None,
     ):
         if ignore_keys is None:
             if hasattr(model, "config"):
@@ -1349,7 +1349,7 @@ def prediction_step(
 
         return (loss.detach(), logits, labels)
 
-    def store_metrics(self, metrics: Dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None:
+    def store_metrics(self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None:
         for key, value in metrics.items():
             self._stored_metrics[train_eval][key].append(value)
 
@@ -1358,7 +1358,7 @@ def evaluation_loop(
         dataloader: DataLoader,
         description: str,
         prediction_loss_only: Optional[bool] = None,
-        ignore_keys: Optional[List[str]] = None,
+        ignore_keys: Optional[list[str]] = None,
         metric_key_prefix: str = "eval",
     ) -> EvalLoopOutput:
         """
@@ -1403,12 +1403,12 @@ def evaluation_loop(
 
         return initial_output
 
-    def log(self, logs: Dict[str, float], start_time: Optional[float] = None) -> None:
+    def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None:
         """
         Log `logs` on the various objects watching training, including stored metrics.
 
         Args:
-            logs (`Dict[str, float]`):
+            logs (`dict[str, float]`):
                 The values to log.
             start_time (`float` or `None`, *optional*, defaults to `None`):
                 Start time of the training.
@@ -1429,7 +1429,7 @@ def create_model_card(
         self,
         model_name: Optional[str] = None,
         dataset_name: Optional[str] = None,
-        tags: Union[str, List[str], None] = None,
+        tags: Union[str, list[str], None] = None,
     ):
         """
         Creates a draft of a model card using the information available to the `Trainer`.
@@ -1439,7 +1439,7 @@ def create_model_card(
                 The name of the model.
             dataset_name (`str`, *optional*, defaults to `None`):
                 The name of the dataset used for training.
-            tags (`str`, `List[str]` or `None`, *optional*, defaults to `None`):
+            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                 Tags to be associated with the model card.
         """
         if not self.is_world_process_zero():
diff --git a/trl/trainer/gkd_config.py b/trl/trainer/gkd_config.py
index 7230b29640..c826ceade2 100644
--- a/trl/trainer/gkd_config.py
+++ b/trl/trainer/gkd_config.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass
-from typing import Any, Dict, Optional
+from typing import Any, Optional
 
 from .sft_config import SFTConfig
 
@@ -36,7 +36,7 @@ class GKDConfig(SFTConfig):
         teacher_model_name_or_path (`Optional[str]`, *optional*, defaults to `None`):
             Model name or path of the teacher model. If `None`, the teacher model will be the same as the model
             being trained.
-        teacher_model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`):
+        teacher_model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`):
             Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the teacher model
             from a string.
         disable_dropout (`bool`, *optional*, defaults to `True`):
@@ -51,7 +51,7 @@ class GKDConfig(SFTConfig):
     beta: float = 0.5
     max_new_tokens: int = 128
     teacher_model_name_or_path: Optional[str] = None
-    teacher_model_init_kwargs: Optional[Dict[str, Any]] = None
+    teacher_model_init_kwargs: Optional[dict[str, Any]] = None
     disable_dropout: bool = True
     seq_kd: bool = False
 
diff --git a/trl/trainer/gkd_trainer.py b/trl/trainer/gkd_trainer.py
index f44335d197..fade180a4e 100644
--- a/trl/trainer/gkd_trainer.py
+++ b/trl/trainer/gkd_trainer.py
@@ -16,7 +16,7 @@
 import textwrap
 import warnings
 from copy import deepcopy
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Optional, Union
 
 import torch
 import torch.nn as nn
@@ -68,14 +68,14 @@ def __init__(
         args: Optional[GKDConfig] = None,
         data_collator: Optional[DataCollator] = None,  # type: ignore
         train_dataset: Optional[Dataset] = None,
-        eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
+        eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None,
         processing_class: Optional[
             Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin]
         ] = None,
         model_init: Optional[Callable[[], PreTrainedModel]] = None,
-        compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
-        callbacks: Optional[List[TrainerCallback]] = None,
-        optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
+        compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None,
+        callbacks: Optional[list[TrainerCallback]] = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
         preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
         peft_config: Optional["PeftConfig"] = None,
         formatting_func: Optional[Callable] = None,
@@ -273,7 +273,7 @@ def generate_on_policy_outputs(model, inputs, generation_config, pad_token_id=No
         return generated_tokens, new_attention_mask, new_labels
 
     def training_step(
-        self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]], num_items_in_batch: Optional[int] = None
+        self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], num_items_in_batch: Optional[int] = None
     ) -> torch.Tensor:
         """
         Perform a training step for the Generalized Knowledge Distillation (GKD) model.
@@ -337,7 +337,7 @@ def create_model_card(
         self,
         model_name: Optional[str] = None,
         dataset_name: Optional[str] = None,
-        tags: Union[str, List[str], None] = None,
+        tags: Union[str, list[str], None] = None,
     ):
         """
         Creates a draft of a model card using the information available to the `Trainer`.
@@ -347,7 +347,7 @@ def create_model_card(
                 The name of the model.
             dataset_name (`str`, *optional*, defaults to `None`):
                 The name of the dataset used for training.
-            tags (`str`, `List[str]` or `None`, *optional*, defaults to `None`):
+            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                 Tags to be associated with the model card.
         """
         if not self.is_world_process_zero():
diff --git a/trl/trainer/iterative_sft_trainer.py b/trl/trainer/iterative_sft_trainer.py
index 6e81a3586f..76891287c7 100644
--- a/trl/trainer/iterative_sft_trainer.py
+++ b/trl/trainer/iterative_sft_trainer.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 import os
 import warnings
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Callable, Optional, Union
 
 import torch
 from datasets import Dataset
@@ -60,7 +60,7 @@ class IterativeSFTTrainer(Trainer):
             Processing class used to process the data. If provided, will be used to automatically process the inputs
             for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
             reuse the fine-tuned model.
-        optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
             The optimizer and scheduler to use for training.
         data_collator (Union[DataCollatorForLanguageModeling, DataCollatorForSeq2Seq], *optional*):
             Data collator to be used for training and passed along the dataloader.
@@ -72,7 +72,7 @@ class IterativeSFTTrainer(Trainer):
             The truncation mode to use, either `keep_end` or `keep_start`.
         preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
             The function to use to preprocess the logits before computing the metrics.
-        compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*):
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
             The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to metric values.
         optimize_device_cache (`bool`, *optional*, defaults to `False`):
             Optimize CUDA cache for slightly more memory-efficient training.
@@ -87,16 +87,16 @@ def __init__(
         processing_class: Optional[
             Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin]
         ] = None,
-        optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (
             None,
             None,
         ),
         data_collator: Optional[DataCollator] = None,
-        eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
+        eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None,
         max_length: Optional[int] = None,
         truncation_mode: Optional[str] = "keep_end",
         preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
-        compute_metrics: Optional[Callable[[EvalLoopOutput], Dict]] = None,
+        compute_metrics: Optional[Callable[[EvalLoopOutput], dict]] = None,
         optimize_device_cache: Optional[bool] = False,
     ):
         # Step 0: check positional arguments validity
@@ -205,25 +205,25 @@ def prepare_model_inputs(self, input_ids: torch.Tensor, attention_mask: torch.Te
 
     @staticmethod
     def _step_safety_checker(
-        input_ids: List[torch.LongTensor],
-        attention_mask: List[torch.LongTensor],
-        labels: List[torch.LongTensor],
-        texts: List[str],
-        texts_labels: List[str],
+        input_ids: list[torch.LongTensor],
+        attention_mask: list[torch.LongTensor],
+        labels: list[torch.LongTensor],
+        texts: list[str],
+        texts_labels: list[str],
     ):
         """
         Check if the input data is valid for training.
 
         Args:
-            input_ids (List[`torch.LongTensor`]):
+            input_ids (list[`torch.LongTensor`]):
                 List of tensors containing the input_ids
-            attention_mask (List[`torch.LongTensor`]):
+            attention_mask (list[`torch.LongTensor`]):
                 List of tensors containing the attention_mask
-            labels (List[`torch.FloatTensor`]):
+            labels (list[`torch.LongTensor`]):
                 List of tensors containing the labels
-            texts (List[`str`]):
+            texts (list[`str`]):
                 List of string containing the text input.
-            texts_labels (List[`str`]):
+            texts_labels (list[`str`]):
                 List of string containing the text labels.
 
         Returns:
@@ -260,24 +260,24 @@ def _step_safety_checker(
     @PPODecorators.empty_device_cache()
     def step(
         self,
-        input_ids: Optional[List[torch.LongTensor]] = None,
-        attention_mask: Optional[List[torch.LongTensor]] = None,
-        labels: Optional[List[torch.LongTensor]] = None,
-        texts: Optional[List[str]] = None,
-        texts_labels: Optional[List[str]] = None,
+        input_ids: Optional[list[torch.LongTensor]] = None,
+        attention_mask: Optional[list[torch.LongTensor]] = None,
+        labels: Optional[list[torch.LongTensor]] = None,
+        texts: Optional[list[str]] = None,
+        texts_labels: Optional[list[str]] = None,
     ):
         """
         Run an optimisation step given a list of input_ids, attention_mask, and labels or a list of text and text_labels.
         Args:
-            input_ids (List[`torch.LongTensor`]):
+            input_ids (list[`torch.LongTensor`]):
                 List of tensors containing the input_ids (if not provided, text will be used)
-            attention_mask (List[`torch.LongTensor`], , *optional*):
+            attention_mask (list[`torch.LongTensor`], *optional*):
                 List of tensors containing the attention_mask
-            labels (List[`torch.FloatTensor`], *optional*):
+            labels (list[`torch.LongTensor`], *optional*):
                 List of tensors containing the labels (if set to None, will default to input_ids)
-            texts (List[`str`], *optional*):
+            texts (list[`str`], *optional*):
                 List of strings containing the text input (if not provided, input_ids will directly be used)
-            texts_labels (List[`str`], *optional*):
+            texts_labels (list[`str`], *optional*):
                 List of strings containing the text labels (if set to None, will default to text)
 
         Returns:
@@ -384,7 +384,7 @@ def _maybe_log_save_evaluate(self):
         # check if logging is required
         if self.args.logging_steps is not None:
             if self.state.global_step % self.args.logging_steps == 0 and self.state.global_step != 0:
-                logs: Dict[str, float] = {}
+                logs: dict[str, float] = {}
 
                 tr_loss_scalar = self._nested_gather(self.tr_loss).mean().item()
 
@@ -402,7 +402,7 @@ def create_model_card(
         self,
         model_name: Optional[str] = None,
         dataset_name: Optional[str] = None,
-        tags: Union[str, List[str], None] = None,
+        tags: Union[str, list[str], None] = None,
     ):
         """
         Creates a draft of a model card using the information available to the `Trainer`.
@@ -412,7 +412,7 @@ def create_model_card(
                 The name of the model.
             dataset_name (`str`, *optional*, defaults to `None`):
                 The name of the dataset used for training.
-            tags (`str`, `List[str]` or `None`, *optional*, defaults to `None`):
+            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                 Tags to be associated with the model card.
         """
         if not self.is_world_process_zero():
diff --git a/trl/trainer/judges.py b/trl/trainer/judges.py
index 7822a24d3a..71f86ef1b3 100644
--- a/trl/trainer/judges.py
+++ b/trl/trainer/judges.py
@@ -15,7 +15,7 @@
 import concurrent.futures
 import logging
 from abc import ABC, abstractmethod
-from typing import List, Optional, Union
+from typing import Optional, Union
 
 import numpy as np
 from accelerate import Accelerator
@@ -67,7 +67,7 @@ class BaseJudge(ABC):
     """
 
     @abstractmethod
-    def judge(self, prompts: List[str], completions: List[str], shuffle_order: bool = True) -> List:
+    def judge(self, prompts: list[str], completions: list[str], shuffle_order: bool = True) -> list:
         raise NotImplementedError("Judge subclasses must implement the `judge` method.")
 
 
@@ -90,20 +90,20 @@ def judge(self, prompts, completions, shuffle_order=True):
     """
 
     @abstractmethod
-    def judge(self, prompts: List[str], completions: List[List[str]], shuffle_order: bool = True) -> List[List[int]]:
+    def judge(self, prompts: list[str], completions: list[list[str]], shuffle_order: bool = True) -> list[list[int]]:
         """
         Judge the completion for the given prompts and return the ranks of each completion.
 
         Args:
-            prompts (`List[str]`):
+            prompts (`list[str]`):
                 List of prompts.
-            completions (`List[List[str]]`):
+            completions (`list[list[str]]`):
                 List of completions list, where each element is a list of completions for the corresponding prompt.
             shuffle_order (`bool`, *optional*, defaults to `True`):
                 Whether to shuffle the order of the completions to avoid positional bias.
 
         Returns:
-            `List[List[int]]`:
+            `list[list[int]]`:
                 List of lists of idxs, where each list contains the ranks of the completions for the corresponding
                 prompt. E.g., `[1, 2, 0]` means that the second completion (`idx=1`) is the best, followed by the
                 third, and then the first.
@@ -117,20 +117,20 @@ class BasePairwiseJudge(BaseJudge):
     """
 
     @abstractmethod
-    def judge(self, prompts: List[str], completions: List[List[str]], shuffle_order: bool = True) -> List[int]:
+    def judge(self, prompts: list[str], completions: list[list[str]], shuffle_order: bool = True) -> list[int]:
         """
         Judge the completion pairs for the given prompts.
 
         Args:
-            prompts (`List[str]`):
+            prompts (`list[str]`):
                 List of prompts.
-            completions (`List[List[str]]`):
+            completions (`list[list[str]]`):
                 List of completions pairs, where each element is a pair of completions for the corresponding prompt.
             shuffle_order (`bool`, *optional*, defaults to `True`):
                 Whether to shuffle the order of the completions to avoid positional bias.
 
         Returns:
-            `List[int]`:
+            `list[int]`:
                 List of idxs, where each idx is the rank of the best completion for the corresponding prompt.
                 E.g., `1` means that the second completion (`idx=1`) is the best.
 
@@ -151,11 +151,11 @@ class BaseBinaryJudge(BaseJudge):
     @abstractmethod
     def judge(
         self,
-        prompts: List[str],
-        completions: List[str],
-        gold_completions: Optional[List[str]] = None,
+        prompts: list[str],
+        completions: list[str],
+        gold_completions: Optional[list[str]] = None,
         shuffle_order: bool = True,
-    ) -> List[int]:
+    ) -> list[int]:
         """
         Judge the completion for a given prompt. Used to assess if a completion satisfies a constraint.
 
@@ -164,13 +164,13 @@ def judge(
         It is relevant for assessing whether or not a prompt completion pair satisfies a specific contraint.
 
         Args:
-            prompts (`List[str]`): List of prompts.
-            completions (`List[str]`): List of completions.
-            gold_completions (`List[str]`, `optional`): List of gold completions if it exists.
+            prompts (`list[str]`): List of prompts.
+            completions (`list[str]`): List of completions.
+            gold_completions (`list[str]`, `optional`): List of gold completions if it exists.
             shuffle_order (`bool`): Whether to shuffle the order of the completions to avoid positional bias.
 
         Returns:
-            List[int]: A list of binary labels:
+            list[int]: A list of binary labels:
                 - 1 indicates that the completion satisfies the evaluated constraint.
                 - 0 indicates that the completion does not satisfy the evaluated constraint.
 
@@ -219,19 +219,19 @@ def __init__(self):
 
     def judge(
         self,
-        prompts: List[str],
-        completions: List[List[str]],
+        prompts: list[str],
+        completions: list[list[str]],
         shuffle_order: bool = True,
         return_scores: bool = False,
         temperature: float = 1.0,
-    ) -> List[Union[int, float]]:
+    ) -> list[Union[int, float]]:
         """
         Judge the completion pairs for the given prompts using the PairRM model.
 
         Args:
-            prompts (`List[str]`):
+            prompts (`list[str]`):
                 List of prompts to judge.
-            completions (`List[List[str]]`):
+            completions (`list[list[str]]`):
                 List of completion pairs for each prompt.
             shuffle_order (`bool`, *optional*, defaults to `True`):
                 Whether to shuffle the order of the completions to avoid positional bias.
@@ -241,7 +241,7 @@ def judge(
                 Temperature for scaling logits if `return_scores` is True.
 
         Returns:
-            `Union[List[int, float]]`:
+            `list[Union[int, float]]`:
                 If `return_scores` is `False`, returns a list of ranks (`0` or `1`) for each prompt, indicating which
                 completion is preferred.
                 If `return_scores` is `True`, returns softmax probabilities for the first completion.
@@ -311,7 +311,7 @@ def __init__(
         self.client = InferenceClient(model=model, token=token)
         self.system_prompt = system_prompt or DEFAULT_PAIRWISE_SYSTEM_PROMPT
 
-    def judge(self, prompts: List[str], completions: List[List[str]], shuffle_order: bool = True) -> List[int]:
+    def judge(self, prompts: list[str], completions: list[list[str]], shuffle_order: bool = True) -> list[int]:
         # Shuffle the order of the completions to avoid positional bias
         if shuffle_order:
             flip_mask = np.random.choice([True, False], size=len(prompts))
@@ -370,7 +370,7 @@ def __init__(
         self.num_requests = 0
         self._warned = False
 
-    def judge(self, prompts: List[str], completions: List[List[str]], shuffle_order: bool = True) -> List[int]:
+    def judge(self, prompts: list[str], completions: list[list[str]], shuffle_order: bool = True) -> list[int]:
         # Check if the limit of requests is reached, if so, use random choice instead
         if self.max_requests is not None and self.num_requests >= self.max_requests:
             if not self._warned:  # Print the warning only once
@@ -423,19 +423,19 @@ class AllTrueJudge(BaseBinaryJudge):
     Implements the Mixture of Judges as described in the [CGPO paper](https://huggingface.co/papers/2409.20370).
 
     Args:
-    judges (`List[BaseBinaryJudge]`): A list of [`BaseBinaryJudge`] instances whose decisions will be unified.
+    judges (`list[BaseBinaryJudge]`): A list of [`BaseBinaryJudge`] instances whose decisions will be unified.
     """
 
-    def __init__(self, judges: List[BaseBinaryJudge]):
+    def __init__(self, judges: list[BaseBinaryJudge]):
         self.judges = judges
 
     def judge(
         self,
-        prompts: List[str],
-        completions: List[str],
-        gold_completions: Optional[List[str]] = None,
+        prompts: list[str],
+        completions: list[str],
+        gold_completions: Optional[list[str]] = None,
         shuffle_order: bool = True,
-    ) -> List[int]:
+    ) -> list[int]:
         all_binary_judgments = [
             judge.judge(prompts, completions, gold_completions, shuffle_order) for judge in self.judges
         ]
diff --git a/trl/trainer/kto_config.py b/trl/trainer/kto_config.py
index 3acca53bde..351885c362 100644
--- a/trl/trainer/kto_config.py
+++ b/trl/trainer/kto_config.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass
-from typing import Any, Dict, Literal, Optional
+from typing import Any, Literal, Optional
 
 from transformers import TrainingArguments
 
@@ -67,10 +67,10 @@ class KTOConfig(TrainingArguments):
         precompute_ref_log_probs (`bool`, *optional*, defaults to `False`):
             Whether to precompute reference model log probabilities for training and evaluation datasets. This is
             useful when training without the reference model to reduce the total GPU memory needed.
-        model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`):
+        model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`):
             Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
             string.
-        ref_model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`):
+        ref_model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`):
             Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the reference model
             from a string.
         dataset_num_proc: (`Optional[int]`, *optional*, defaults to `None`):
@@ -94,6 +94,6 @@ class KTOConfig(TrainingArguments):
     is_encoder_decoder: Optional[bool] = None
     disable_dropout: bool = True
     precompute_ref_log_probs: bool = False
-    model_init_kwargs: Optional[Dict[str, Any]] = None
-    ref_model_init_kwargs: Optional[Dict[str, Any]] = None
+    model_init_kwargs: Optional[dict[str, Any]] = None
+    ref_model_init_kwargs: Optional[dict[str, Any]] = None
     dataset_num_proc: Optional[int] = None
diff --git a/trl/trainer/kto_trainer.py b/trl/trainer/kto_trainer.py
index 309a2c220a..2ef78b05f9 100644
--- a/trl/trainer/kto_trainer.py
+++ b/trl/trainer/kto_trainer.py
@@ -21,7 +21,7 @@
 from contextlib import contextmanager, nullcontext
 from copy import deepcopy
 from operator import itemgetter
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union
 
 import numpy as np
 import torch
@@ -79,7 +79,7 @@
 RUNNING_NAME = "running.pt"
 
 
-def _get_kl_dataset(batch: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+def _get_kl_dataset(batch: dict[str, list[Any]]) -> dict[str, list[Any]]:
     """
     Creates mismatched pairs of prompts and completions for the KL dataset by adding a +1 offset to the order of completions.
     For best results, the mismatched outputs y' used to estimate the KL term for a batch should be the same set as the matched
@@ -91,9 +91,9 @@ def _get_kl_dataset(batch: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
 
 
 def _tokenize(
-    batch: Dict[str, List[Any]],
+    batch: dict[str, list[Any]],
     tokenizer: "PreTrainedTokenizer",
-) -> Dict[str, List[Any]]:
+) -> dict[str, list[Any]]:
     """Tokenize a batch from a KTO specific dataset."""
     prompt_tokenized = tokenizer(batch["prompt"], add_special_tokens=False)
     prompt_input_ids = prompt_tokenized["input_ids"]
@@ -148,7 +148,7 @@ def _tokenize(
     return output
 
 
-def _process_tokens(example: Dict[str, Any], model: "PreTrainedModel" = None, **kwargs) -> Dict:
+def _process_tokens(example: dict[str, Any], model: "PreTrainedModel" = None, **kwargs) -> dict:
     """Process tokens of a KTO specific dataset.
 
     At this stage, we don't convert to PyTorch tensors yet; we just handle the truncation
@@ -296,17 +296,17 @@ class KTOTrainer(Trainer):
             which will pad the sequences to the maximum length of the sequences in the batch, given a dataset of paired sequences.
         model_init (`Callable[[], transformers.PreTrainedModel]`):
             The model initializer to use for training. If None is specified, the default model initializer will be used.
-        callbacks (`List[transformers.TrainerCallback]`):
+        callbacks (`list[transformers.TrainerCallback]`):
             The callbacks to use for training.
-        optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
             The optimizer and scheduler to use for training.
         preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
             The function to use to preprocess the logits before computing the metrics.
-        peft_config (`Dict`, defaults to `None`):
+        peft_config (`dict`, defaults to `None`):
             The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in a PEFT model.
         disable_dropout (`bool`, defaults to `True`):
             Whether or not to disable dropouts in `model` and `ref_model`.
-        compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*):
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
             The function to use to compute the metrics. Must take a `EvalPrediction` and return
             a dictionary string to metric values.
         model_adapter_name (`str`, defaults to `None`):
@@ -326,17 +326,17 @@ def __init__(
         ref_model: Optional[Union[PreTrainedModel, nn.Module, str]] = None,
         args: KTOConfig = None,
         train_dataset: Optional[Dataset] = None,
-        eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
+        eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None,
         processing_class: Optional[
             Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin]
         ] = None,
         data_collator: Optional[DataCollator] = None,
         model_init: Optional[Callable[[], PreTrainedModel]] = None,
-        callbacks: Optional[List[TrainerCallback]] = None,
-        optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
+        callbacks: Optional[list[TrainerCallback]] = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
         preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
-        peft_config: Optional[Dict] = None,
-        compute_metrics: Optional[Callable[[EvalLoopOutput], Dict]] = None,
+        peft_config: Optional[dict] = None,
+        compute_metrics: Optional[Callable[[EvalLoopOutput], dict]] = None,
         model_adapter_name: Optional[str] = None,
         ref_adapter_name: Optional[str] = None,
     ):
@@ -921,7 +921,7 @@ def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoa
 
         return super().get_eval_dataloader(eval_dataset=eval_dataset)
 
-    def compute_reference_log_probs(self, padded_batch: Dict) -> Dict:
+    def compute_reference_log_probs(self, padded_batch: dict) -> dict:
         """Computes log probabilities of the reference model for a single padded batch of a KTO specific dataset."""
         with torch.no_grad():
             if self.ref_model is None:
@@ -1041,8 +1041,8 @@ def get_batch_logps(
             return (per_token_logps * loss_mask).sum(-1)
 
     def forward(
-        self, model: nn.Module, batch: Dict[str, Union[List, torch.LongTensor]]
-    ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
+        self, model: nn.Module, batch: dict[str, Union[list, torch.LongTensor]]
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
         if self.calculate_KL:
             KL_logps = None
             KL_model_kwargs = (
@@ -1127,7 +1127,7 @@ def kto_loss(
         reference_chosen_logps: torch.FloatTensor,
         reference_rejected_logps: torch.FloatTensor,
         reference_KL_logps: torch.FloatTensor,
-    ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
         """Compute the KTO loss for a batch of policy and reference model log probabilities.
 
         Args:
@@ -1194,7 +1194,7 @@ def kto_loss(
     def get_batch_loss_metrics(
         self,
         model,
-        batch: Dict[str, Union[List, torch.LongTensor]],
+        batch: dict[str, Union[list, torch.LongTensor]],
     ):
         """Compute the KTO loss and other metrics for the given batch of inputs for train or test."""
         metrics = {}
@@ -1279,10 +1279,10 @@ def get_batch_loss_metrics(
     def compute_loss(
         self,
         model: Union[PreTrainedModel, nn.Module],
-        inputs: Dict[str, Union[torch.Tensor, Any]],
+        inputs: dict[str, Union[torch.Tensor, Any]],
         return_outputs=False,
         num_items_in_batch=None,
-    ) -> Union[torch.Tensor, Tuple[torch.Tensor, Dict[str, torch.Tensor]]]:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, dict[str, torch.Tensor]]]:
         if not self.use_dpo_data_collator:
             warnings.warn(
                 "compute_loss is only implemented for DPODataCollatorWithPadding, and you passed a datacollator that is different than "
@@ -1303,7 +1303,7 @@ def compute_loss(
             return (loss, metrics)
         return loss
 
-    def store_metrics(self, metrics: Dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None:
+    def store_metrics(self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None:
         for key, value in metrics.items():
             self._stored_metrics[train_eval][key].append(value)
 
@@ -1312,7 +1312,7 @@ def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
             return None
         return SequentialSampler(self.train_dataset)
 
-    def generate_from_model_and_ref(self, model, batch: Dict[str, torch.LongTensor]) -> Tuple[str, str]:
+    def generate_from_model_and_ref(self, model, batch: dict[str, torch.LongTensor]) -> tuple[str, str]:
         """Generate samples from the model and reference model for the given batch of inputs."""
 
         # If one uses `generate_during_eval` with peft + bf16, we need to explicitly call generate with
@@ -1361,9 +1361,9 @@ def generate_from_model_and_ref(self, model, batch: Dict[str, torch.LongTensor])
     def prediction_step(
         self,
         model: Union[PreTrainedModel, nn.Module],
-        inputs: Dict[str, Union[torch.Tensor, Any]],
+        inputs: dict[str, Union[torch.Tensor, Any]],
         prediction_loss_only: bool,
-        ignore_keys: Optional[List[str]] = None,
+        ignore_keys: Optional[list[str]] = None,
     ):
         if not self.use_dpo_data_collator:
             warnings.warn(
@@ -1404,7 +1404,7 @@ def evaluation_loop(
         dataloader: DataLoader,
         description: str,
         prediction_loss_only: Optional[bool] = None,
-        ignore_keys: Optional[List[str]] = None,
+        ignore_keys: Optional[list[str]] = None,
         metric_key_prefix: str = "eval",
     ) -> EvalLoopOutput:
         """
@@ -1455,12 +1455,12 @@ def evaluation_loop(
 
         return initial_output
 
-    def log(self, logs: Dict[str, float], start_time: Optional[float] = None) -> None:
+    def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None:
         """
         Log `logs` on the various objects watching training, including stored metrics.
 
         Args:
-            logs (`Dict[str, float]`):
+            logs (`dict[str, float]`):
                 The values to log.
             start_time (`float` or `None`, *optional*, defaults to `None`):
                 Start time of the training.
@@ -1498,7 +1498,7 @@ def create_model_card(
         self,
         model_name: Optional[str] = None,
         dataset_name: Optional[str] = None,
-        tags: Union[str, List[str], None] = None,
+        tags: Union[str, list[str], None] = None,
     ):
         """
         Creates a draft of a model card using the information available to the `Trainer`.
@@ -1508,7 +1508,7 @@ def create_model_card(
                 The name of the model.
             dataset_name (`str`, *optional*, defaults to `None`):
                 The name of the dataset used for training.
-            tags (`str`, `List[str]` or `None`, *optional*, defaults to `None`):
+            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                 Tags to be associated with the model card.
         """
         if not self.is_world_process_zero():
diff --git a/trl/trainer/model_config.py b/trl/trainer/model_config.py
index 5518d1979e..7301d10213 100644
--- a/trl/trainer/model_config.py
+++ b/trl/trainer/model_config.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from dataclasses import dataclass
-from typing import List, Literal, Optional
+from typing import Literal, Optional
 
 
 @dataclass
@@ -53,9 +53,9 @@ class ModelConfig:
             LoRA alpha.
         lora_dropout (`float`, *optional*, defaults to `0.05`):
             LoRA dropout.
-        lora_target_modules (`Optional[Union[str, List[str]]]`, *optional*, defaults to `None`):
+        lora_target_modules (`Optional[Union[str, list[str]]]`, *optional*, defaults to `None`):
             LoRA target modules.
-        lora_modules_to_save (`Optional[List[str]]`, *optional*, defaults to `None`):
+        lora_modules_to_save (`Optional[list[str]]`, *optional*, defaults to `None`):
             Model layers to unfreeze & train.
         lora_task_type (`str`, *optional*, defaults to `"CAUSAL_LM"`):
             Task type to pass for LoRA (use `"SEQ_CLS"` for reward modeling).
@@ -81,8 +81,8 @@ class ModelConfig:
     lora_r: int = 16
     lora_alpha: int = 32
     lora_dropout: float = 0.05
-    lora_target_modules: Optional[List[str]] = None
-    lora_modules_to_save: Optional[List[str]] = None
+    lora_target_modules: Optional[list[str]] = None
+    lora_modules_to_save: Optional[list[str]] = None
     lora_task_type: str = "CAUSAL_LM"
     use_rslora: bool = False
     load_in_8bit: bool = False
diff --git a/trl/trainer/nash_md_config.py b/trl/trainer/nash_md_config.py
index 2c2089fed2..dadad01f03 100644
--- a/trl/trainer/nash_md_config.py
+++ b/trl/trainer/nash_md_config.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 from dataclasses import dataclass, field
-from typing import List
 
 from trl.trainer.online_dpo_config import OnlineDPOConfig
 
@@ -32,7 +31,7 @@ class NashMDConfig(OnlineDPOConfig):
             epochs.
     """
 
-    mixture_coef: List[float] = field(default_factory=lambda: [0.5])
+    mixture_coef: list[float] = field(default_factory=lambda: [0.5])
 
     def __post_init__(self):
         super().__post_init__()
diff --git a/trl/trainer/nash_md_trainer.py b/trl/trainer/nash_md_trainer.py
index c998174765..a37f4bb170 100644
--- a/trl/trainer/nash_md_trainer.py
+++ b/trl/trainer/nash_md_trainer.py
@@ -14,7 +14,7 @@
 
 import os
 import textwrap
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Optional, Union
 
 import jinja2
 import torch
@@ -78,14 +78,14 @@ class NashMDTrainer(OnlineDPOTrainer):
             Processing class used to process the data. If provided, will be used to automatically process the inputs
             for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
             reuse the fine-tuned model.
-        peft_config (`Dict`):
+        peft_config (`dict`):
             The peft config to use for training.
-        compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*):
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
             The function to use to compute the metrics. Must take a `EvalPrediction` and return
             a dictionary string to metric values.
-        callbacks (`List[transformers.TrainerCallback]`):
+        callbacks (`list[transformers.TrainerCallback]`):
             The callbacks to use for training.
-        optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
             The optimizer and scheduler to use for training.
         preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
             The function to use to preprocess the logits before computing the metrics.
@@ -102,14 +102,14 @@ def __init__(
         args: Optional[NashMDConfig] = None,
         data_collator: Optional[Callable] = None,
         train_dataset: Optional[Union[Dataset, IterableDataset]] = None,
-        eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
+        eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None,
         processing_class: Optional[
             Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin]
         ] = None,
-        peft_config: Optional[Dict] = None,
-        compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
-        callbacks: Optional[List[TrainerCallback]] = None,
-        optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
+        peft_config: Optional[dict] = None,
+        compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None,
+        callbacks: Optional[list[TrainerCallback]] = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
         preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
     ) -> None:
         super().__init__(
@@ -372,7 +372,7 @@ def gather_mean(tensor):
         self.stats["mixture_coef"].append(self.mixture_coef)
 
     def training_step(
-        self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]], num_items_in_batch: Optional[int] = None
+        self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], num_items_in_batch: Optional[int] = None
     ) -> torch.Tensor:
         model.train()
 
@@ -455,7 +455,7 @@ def create_model_card(
         self,
         model_name: Optional[str] = None,
         dataset_name: Optional[str] = None,
-        tags: Union[str, List[str], None] = None,
+        tags: Union[str, list[str], None] = None,
     ):
         """
         Creates a draft of a model card using the information available to the `Trainer`.
@@ -465,7 +465,7 @@ def create_model_card(
                 The name of the model.
             dataset_name (`str`, *optional*, defaults to `None`):
                 The name of the dataset used for training.
-            tags (`str`, `List[str]` or `None`, *optional*, defaults to `None`):
+            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                 Tags to be associated with the model card.
         """
         if not self.is_world_process_zero():
diff --git a/trl/trainer/online_dpo_config.py b/trl/trainer/online_dpo_config.py
index 16ae8105fc..34f8ec5837 100644
--- a/trl/trainer/online_dpo_config.py
+++ b/trl/trainer/online_dpo_config.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from dataclasses import dataclass, field
-from typing import List, Literal, Optional
+from typing import Literal, Optional
 
 from transformers import TrainingArguments
 
@@ -66,7 +66,7 @@ class OnlineDPOConfig(TrainingArguments):
     max_new_tokens: int = 64
     temperature: float = 0.9
     missing_eos_penalty: Optional[float] = None
-    beta: List[float] = field(default_factory=lambda: [0.1])
+    beta: list[float] = field(default_factory=lambda: [0.1])
     loss_type: Literal["sigmoid", "ipo"] = "sigmoid"
     dataset_num_proc: Optional[int] = None
     disable_dropout: bool = True
diff --git a/trl/trainer/online_dpo_trainer.py b/trl/trainer/online_dpo_trainer.py
index 7014ba6926..4dbd6a050c 100644
--- a/trl/trainer/online_dpo_trainer.py
+++ b/trl/trainer/online_dpo_trainer.py
@@ -16,7 +16,7 @@
 import textwrap
 import warnings
 from functools import wraps
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Optional, Union
 
 import datasets
 import jinja2
@@ -112,14 +112,14 @@ class OnlineDPOTrainer(Trainer):
             Processing class used to process the data. If provided, will be used to automatically process the inputs
             for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
             reuse the fine-tuned model.
-        peft_config (`Dict`):
+        peft_config (`dict`):
             The peft config to use for training.
-        compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*):
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
             The function to use to compute the metrics. Must take a `EvalPrediction` and return
             a dictionary string to metric values.
-        callbacks (`List[transformers.TrainerCallback]`):
+        callbacks (`list[transformers.TrainerCallback]`):
             The callbacks to use for training.
-        optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
             The optimizer and scheduler to use for training.
         preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
             The function to use to preprocess the logits before computing the metrics.
@@ -139,15 +139,15 @@ def __init__(
         args: Optional[OnlineDPOConfig] = None,
         data_collator: Optional[DataCollator] = None,
         train_dataset: Optional[Union[Dataset, IterableDataset, "datasets.Dataset"]] = None,
-        eval_dataset: Optional[Union[Dataset, Dict[str, Dataset], "datasets.Dataset"]] = None,
+        eval_dataset: Optional[Union[Dataset, dict[str, Dataset], "datasets.Dataset"]] = None,
         processing_class: Optional[
             Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin]
         ] = None,
         reward_processing_class: Optional[PreTrainedTokenizerBase] = None,
-        peft_config: Optional[Dict] = None,
-        compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
-        callbacks: Optional[List[TrainerCallback]] = None,
-        optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
+        peft_config: Optional[dict] = None,
+        compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None,
+        callbacks: Optional[list[TrainerCallback]] = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
         preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
     ) -> None:
         if ref_model is model:
@@ -299,7 +299,7 @@ def beta(self):
             return self._beta
 
     @staticmethod
-    def tokenize_row(feature, is_encoder_decoder: bool, tokenizer: PreTrainedTokenizerBase) -> Dict[str, Any]:
+    def tokenize_row(feature, is_encoder_decoder: bool, tokenizer: PreTrainedTokenizerBase) -> dict[str, Any]:
         """Tokenize a single row from a DPO specific dataset."""
         if not is_encoder_decoder:
             batch = tokenizer(feature["prompt"], add_special_tokens=False)
@@ -388,7 +388,7 @@ def get_eval_dataloader(self, eval_dataset: Optional[Union[str, Dataset]] = None
         return self.accelerator.prepare(eval_dataloader)
 
     def training_step(
-        self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]], num_items_in_batch: Optional[int] = None
+        self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], num_items_in_batch: Optional[int] = None
     ) -> torch.Tensor:
         model.train()
 
@@ -602,7 +602,7 @@ def training_step(
     # start_time defaults to None to allow compatibility with transformers<=4.46
     def _maybe_log_save_evaluate(self, tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval, start_time=None):
         if self.control.should_log and self.state.global_step > self._globalstep_last_logged:
-            logs: Dict[str, float] = {}
+            logs: dict[str, float] = {}
 
             # all_gather + mean() to get average loss over all processes
             tr_loss_scalar = self._nested_gather(tr_loss).mean().item()
@@ -687,7 +687,7 @@ def create_model_card(
         self,
         model_name: Optional[str] = None,
         dataset_name: Optional[str] = None,
-        tags: Union[str, List[str], None] = None,
+        tags: Union[str, list[str], None] = None,
     ):
         """
         Creates a draft of a model card using the information available to the `Trainer`.
@@ -697,7 +697,7 @@ def create_model_card(
                 The name of the model.
             dataset_name (`str`, *optional*, defaults to `None`):
                 The name of the dataset used for training.
-            tags (`str`, `List[str]` or `None`, *optional*, defaults to `None`):
+            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                 Tags to be associated with the model card.
         """
         if not self.is_world_process_zero():
diff --git a/trl/trainer/orpo_config.py b/trl/trainer/orpo_config.py
index 8d2100a189..16aafbc787 100644
--- a/trl/trainer/orpo_config.py
+++ b/trl/trainer/orpo_config.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass
-from typing import Any, Dict, Optional
+from typing import Any, Optional
 
 from transformers import TrainingArguments
 
@@ -55,7 +55,7 @@ class ORPOConfig(TrainingArguments):
         is_encoder_decoder (`Optional[bool]`, *optional*, defaults to `None`):
             When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument,
             you need to specify if the model returned by the callable is an encoder-decoder model.
-        model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`):
+        model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`):
             Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
             string.
         dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`):
@@ -73,5 +73,5 @@ class ORPOConfig(TrainingArguments):
     truncation_mode: str = "keep_end"
     generate_during_eval: bool = False
     is_encoder_decoder: Optional[bool] = None
-    model_init_kwargs: Optional[Dict[str, Any]] = None
+    model_init_kwargs: Optional[dict[str, Any]] = None
     dataset_num_proc: Optional[int] = None
diff --git a/trl/trainer/orpo_trainer.py b/trl/trainer/orpo_trainer.py
index 529baed769..3551a7960f 100644
--- a/trl/trainer/orpo_trainer.py
+++ b/trl/trainer/orpo_trainer.py
@@ -22,7 +22,7 @@
 from collections import defaultdict
 from contextlib import nullcontext
 from copy import deepcopy
-from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union
+from typing import Any, Callable, Literal, Optional, Union
 
 import numpy as np
 import torch
@@ -102,15 +102,15 @@ class ORPOTrainer(Trainer):
             reuse the fine-tuned model.
         model_init (`Callable[[], transformers.PreTrainedModel]`):
             The model initializer to use for training. If None is specified, the default model initializer will be used.
-        callbacks (`List[transformers.TrainerCallback]`):
+        callbacks (`list[transformers.TrainerCallback]`):
             The callbacks to use for training.
-        optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
             The optimizer and scheduler to use for training.
         preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
             The function to use to preprocess the logits before computing the metrics.
-        peft_config (`Dict`, defaults to `None`):
+        peft_config (`dict`, defaults to `None`):
             The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in a PEFT model.
-        compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*):
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
             The function to use to compute the metrics. Must take a `EvalPrediction` and return
             a dictionary string to metric values.
     """
@@ -126,16 +126,16 @@ def __init__(
         args: Optional[ORPOConfig] = None,
         data_collator: Optional[DataCollator] = None,
         train_dataset: Optional[Dataset] = None,
-        eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
+        eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None,
         processing_class: Optional[
             Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin]
         ] = None,
         model_init: Optional[Callable[[], PreTrainedModel]] = None,
-        callbacks: Optional[List[TrainerCallback]] = None,
-        optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
+        callbacks: Optional[list[TrainerCallback]] = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
         preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
-        peft_config: Optional[Dict] = None,
-        compute_metrics: Optional[Callable[[EvalLoopOutput], Dict]] = None,
+        peft_config: Optional[dict] = None,
+        compute_metrics: Optional[Callable[[EvalLoopOutput], dict]] = None,
     ):
         if args.model_init_kwargs is None:
             model_init_kwargs = {}
@@ -440,7 +440,7 @@ def build_tokenized_answer(self, prompt, answer):
             attention_mask=answer_attention_mask,
         )
 
-    def tokenize_row(self, feature, model: Optional[Union[PreTrainedModel, nn.Module]] = None) -> Dict:
+    def tokenize_row(self, feature, model: Optional[Union[PreTrainedModel, nn.Module]] = None) -> dict:
         """Tokenize a single row from a ORPO specific dataset.
 
         At this stage, we don't convert to PyTorch tensors yet; we just handle the truncation
@@ -598,12 +598,12 @@ def tokenize_row(self, feature, model: Optional[Union[PreTrainedModel, nn.Module
 
     @staticmethod
     def concatenated_inputs(
-        batch: Dict[str, Union[List, torch.LongTensor]],
+        batch: dict[str, Union[list, torch.LongTensor]],
         is_encoder_decoder: bool = False,
         label_pad_token_id: int = -100,
         padding_value: int = 0,
         device: Optional[torch.device] = None,
-    ) -> Dict[str, torch.LongTensor]:
+    ) -> dict[str, torch.LongTensor]:
         """Concatenate the chosen and rejected inputs into a single tensor.
 
         Args:
@@ -662,7 +662,7 @@ def odds_ratio_loss(
         self,
         policy_chosen_logps: torch.FloatTensor,
         policy_rejected_logps: torch.FloatTensor,
-    ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
         """Compute ORPO's odds ratio (OR) loss for a batch of policy and reference model log probabilities.
 
         Args:
@@ -728,8 +728,8 @@ def get_batch_logps(
             return (per_token_logps * loss_mask).sum(-1)
 
     def concatenated_forward(
-        self, model: nn.Module, batch: Dict[str, Union[List, torch.LongTensor]]
-    ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
+        self, model: nn.Module, batch: dict[str, Union[list, torch.LongTensor]]
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
         """Run the given model on the given batch of inputs, concatenating the chosen and rejected inputs together.
 
         We do this to avoid doing two forward passes, because it's faster for FSDP.
@@ -807,7 +807,7 @@ def cross_entropy_loss(logits, labels):
     def get_batch_loss_metrics(
         self,
         model,
-        batch: Dict[str, Union[List, torch.LongTensor]],
+        batch: dict[str, Union[list, torch.LongTensor]],
         train_eval: Literal["train", "eval"] = "train",
     ):
         """Compute the ORPO loss and other metrics for the given batch of inputs for train or test."""
@@ -856,10 +856,10 @@ def get_batch_loss_metrics(
     def compute_loss(
         self,
         model: Union[PreTrainedModel, nn.Module],
-        inputs: Dict[str, Union[torch.Tensor, Any]],
+        inputs: dict[str, Union[torch.Tensor, Any]],
         return_outputs=False,
         num_items_in_batch=None,
-    ) -> Union[torch.Tensor, Tuple[torch.Tensor, Dict[str, torch.Tensor]]]:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, dict[str, torch.Tensor]]]:
         if not self.use_dpo_data_collator:
             warnings.warn(
                 "compute_loss is only implemented for DPODataCollatorWithPadding, and you passed a datacollator that is different than "
@@ -881,7 +881,7 @@ def compute_loss(
             return (loss, metrics)
         return loss
 
-    def generate_from_model(self, model, batch: Dict[str, torch.LongTensor]) -> str:
+    def generate_from_model(self, model, batch: dict[str, torch.LongTensor]) -> str:
         """Generate samples from the model and reference model for the given batch of inputs."""
 
         # If one uses `generate_during_eval` with peft + bf16, we need to explicitly call generate with
@@ -905,9 +905,9 @@ def generate_from_model(self, model, batch: Dict[str, torch.LongTensor]) -> str:
     def prediction_step(
         self,
         model: Union[PreTrainedModel, nn.Module],
-        inputs: Dict[str, Union[torch.Tensor, Any]],
+        inputs: dict[str, Union[torch.Tensor, Any]],
         prediction_loss_only: bool,
-        ignore_keys: Optional[List[str]] = None,
+        ignore_keys: Optional[list[str]] = None,
     ):
         if not self.use_dpo_data_collator:
             warnings.warn(
@@ -942,7 +942,7 @@ def prediction_step(
 
         return (loss.detach(), logits, labels)
 
-    def store_metrics(self, metrics: Dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None:
+    def store_metrics(self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None:
         for key, value in metrics.items():
             self._stored_metrics[train_eval][key].append(value)
 
@@ -951,7 +951,7 @@ def evaluation_loop(
         dataloader: DataLoader,
         description: str,
         prediction_loss_only: Optional[bool] = None,
-        ignore_keys: Optional[List[str]] = None,
+        ignore_keys: Optional[list[str]] = None,
         metric_key_prefix: str = "eval",
     ) -> EvalLoopOutput:
         """
@@ -994,12 +994,12 @@ def evaluation_loop(
 
         return initial_output
 
-    def log(self, logs: Dict[str, float], start_time: Optional[float] = None) -> None:
+    def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None:
         """
         Log `logs` on the various objects watching training, including stored metrics.
 
         Args:
-            logs (`Dict[str, float]`):
+            logs (`dict[str, float]`):
                 The values to log.
             start_time (`float` or `None`, *optional*, defaults to `None`):
                 Start time of the training.
@@ -1043,7 +1043,7 @@ def create_model_card(
         self,
         model_name: Optional[str] = None,
         dataset_name: Optional[str] = None,
-        tags: Union[str, List[str], None] = None,
+        tags: Union[str, list[str], None] = None,
     ):
         """
         Creates a draft of a model card using the information available to the `Trainer`.
@@ -1053,7 +1053,7 @@ def create_model_card(
                 The name of the model.
             dataset_name (`str`, *optional*, defaults to `None`):
                 The name of the dataset used for training.
-            tags (`str`, `List[str]` or `None`, *optional*, defaults to `None`):
+            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                 Tags to be associated with the model card.
         """
         if not self.is_world_process_zero():
diff --git a/trl/trainer/ppo_trainer.py b/trl/trainer/ppo_trainer.py
index 98cbd8ab6f..2b55e5797c 100644
--- a/trl/trainer/ppo_trainer.py
+++ b/trl/trainer/ppo_trainer.py
@@ -19,7 +19,7 @@
 import time
 from collections import defaultdict
 from contextlib import contextmanager, nullcontext
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Optional, Union
 
 import numpy as np
 import pandas as pd
@@ -116,10 +116,10 @@ def __init__(
         train_dataset: Dataset,
         value_model: Optional[nn.Module] = None,
         data_collator: Optional[DataCollatorWithPadding] = None,
-        eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
+        eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None,
         # less commonly used
-        optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
-        callbacks: Optional[List[TrainerCallback]] = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
+        callbacks: Optional[list[TrainerCallback]] = None,
         peft_config: Optional["PeftConfig"] = None,
     ) -> None:
         if ref_model is model:
@@ -733,7 +733,7 @@ def create_model_card(
         self,
         model_name: Optional[str] = None,
         dataset_name: Optional[str] = None,
-        tags: Union[str, List[str], None] = None,
+        tags: Union[str, list[str], None] = None,
     ):
         """
         Creates a draft of a model card using the information available to the `Trainer`.
@@ -743,7 +743,7 @@ def create_model_card(
                 The name of the model.
             dataset_name (`str`, *optional*, defaults to `None`):
                 The name of the dataset used for training.
-            tags (`str`, `List[str]` or `None`, *optional*, defaults to `None`):
+            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                 Tags to be associated with the model card.
         """
         if not self.is_world_process_zero():
diff --git a/trl/trainer/reward_trainer.py b/trl/trainer/reward_trainer.py
index 5dd7c53efd..c76c2461ec 100644
--- a/trl/trainer/reward_trainer.py
+++ b/trl/trainer/reward_trainer.py
@@ -16,7 +16,7 @@
 import warnings
 from collections import defaultdict
 from dataclasses import FrozenInstanceError, replace
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Optional, Union
 
 import pandas as pd
 import torch
@@ -59,7 +59,7 @@
     import wandb
 
 
-def _tokenize(batch: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizerBase") -> Dict[str, List[Any]]:
+def _tokenize(batch: dict[str, list[Any]], tokenizer: "PreTrainedTokenizerBase") -> dict[str, list[Any]]:
     """Tokenize a batch from a reward modelling dataset."""
     new_examples = {
         "input_ids_chosen": [],
@@ -90,20 +90,20 @@ def __init__(
         args: Optional[RewardConfig] = None,
         data_collator: Optional[DataCollator] = None,
         train_dataset: Optional[Dataset] = None,
-        eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
+        eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None,
         processing_class: Optional[
             Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin]
         ] = None,
         model_init: Optional[Callable[[], PreTrainedModel]] = None,
-        compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
-        callbacks: Optional[List[TrainerCallback]] = None,
-        optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (
+        compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None,
+        callbacks: Optional[list[TrainerCallback]] = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (
             None,
             None,
         ),
         preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
         max_length: Optional[int] = None,
-        peft_config: Optional[Dict] = None,
+        peft_config: Optional[dict] = None,
     ):
         """
         Initialize RewardTrainer.
@@ -126,15 +126,15 @@ def __init__(
                 reuse the fine-tuned model.
             model_init (`Callable[[], transformers.PreTrainedModel]`):
                 The model initializer to use for training. If None is specified, the default model initializer will be used.
-            compute_metrics (`Callable[[transformers.EvalPrediction], Dict]`, *optional* defaults to `compute_accuracy`):
+            compute_metrics (`Callable[[transformers.EvalPrediction], dict]`, *optional* defaults to `compute_accuracy`):
                 The metrics to use for evaluation. If no metrics are specified, the default metric (`compute_accuracy`) will be used.
-            callbacks (`List[transformers.TrainerCallback]`):
+            callbacks (`list[transformers.TrainerCallback]`):
                 The callbacks to use for training.
-            optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+            optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
                 The optimizer and scheduler to use for training.
             preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
                 The function to use to preprocess the logits before computing the metrics.
-            peft_config (`Dict`, defaults to `None`):
+            peft_config (`dict`, defaults to `None`):
                 The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in a PEFT model.
         """
         if type(args) is TrainingArguments:
@@ -277,10 +277,10 @@ def __init__(
     def compute_loss(
         self,
         model: Union[PreTrainedModel, nn.Module],
-        inputs: Dict[str, Union[torch.Tensor, Any]],
+        inputs: dict[str, Union[torch.Tensor, Any]],
         return_outputs=False,
         num_items_in_batch=None,
-    ) -> Union[torch.Tensor, Tuple[torch.Tensor, Dict[str, torch.Tensor]]]:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, dict[str, torch.Tensor]]]:
         if not self.use_reward_data_collator:
             warnings.warn(
                 "The current compute_loss is implemented for RewardDataCollatorWithPadding,"
@@ -316,10 +316,10 @@ def compute_loss(
     def prediction_step(
         self,
         model: Union[PreTrainedModel, nn.Module],
-        inputs: Dict[str, Union[torch.Tensor, Any]],
+        inputs: dict[str, Union[torch.Tensor, Any]],
         prediction_loss_only: bool,
-        ignore_keys: Optional[List[str]] = None,
-    ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
+        ignore_keys: Optional[list[str]] = None,
+    ) -> tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
         inputs = self._prepare_inputs(inputs)
         if ignore_keys is None:
             if hasattr(self.model, "config"):
@@ -384,7 +384,7 @@ def create_model_card(
         self,
         model_name: Optional[str] = None,
         dataset_name: Optional[str] = None,
-        tags: Union[str, List[str], None] = None,
+        tags: Union[str, list[str], None] = None,
     ):
         """
         Creates a draft of a model card using the information available to the `Trainer`.
@@ -394,7 +394,7 @@ def create_model_card(
                 The name of the model.
             dataset_name (`str`, *optional*, defaults to `None`):
                 The name of the dataset used for training.
-            tags (`str`, `List[str]` or `None`, *optional*, defaults to `None`):
+            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                 Tags to be associated with the model card.
         """
         if not self.is_world_process_zero():
diff --git a/trl/trainer/rloo_trainer.py b/trl/trainer/rloo_trainer.py
index ea8d4cf713..106426073f 100644
--- a/trl/trainer/rloo_trainer.py
+++ b/trl/trainer/rloo_trainer.py
@@ -18,7 +18,7 @@
 import textwrap
 import time
 from collections import defaultdict
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Optional, Union
 
 import numpy as np
 import pandas as pd
@@ -86,10 +86,10 @@ def __init__(
         reward_model: nn.Module,
         train_dataset: Dataset,
         data_collator: Optional[DataCollatorWithPadding] = None,
-        eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
+        eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None,
         # less commonly used
-        optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
-        callbacks: Optional[List[TrainerCallback]] = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
+        callbacks: Optional[list[TrainerCallback]] = None,
     ) -> None:
         if ref_policy is policy:
             raise ValueError(
@@ -560,7 +560,7 @@ def create_model_card(
         self,
         model_name: Optional[str] = None,
         dataset_name: Optional[str] = None,
-        tags: Union[str, List[str], None] = None,
+        tags: Union[str, list[str], None] = None,
     ):
         """
         Creates a draft of a model card using the information available to the `Trainer`.
@@ -570,7 +570,7 @@ def create_model_card(
                 The name of the model.
             dataset_name (`str`, *optional*, defaults to `None`):
                 The name of the dataset used for training.
-            tags (`str`, `List[str]` or `None`, *optional*, defaults to `None`):
+            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                 Tags to be associated with the model card.
         """
         if not self.is_world_process_zero():
diff --git a/trl/trainer/sft_config.py b/trl/trainer/sft_config.py
index a407a01ee6..eefc22b267 100644
--- a/trl/trainer/sft_config.py
+++ b/trl/trainer/sft_config.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass
-from typing import Any, Dict, Optional
+from typing import Any, Optional
 
 from transformers import TrainingArguments
 
@@ -42,10 +42,10 @@ class SFTConfig(TrainingArguments):
         dataset_batch_size (`Union[int, None]`, *optional*, defaults to `1000`):
             Number of examples to tokenize per batch. If `dataset_batch_size <= 0` or `dataset_batch_size is None`,
             tokenizes the full dataset as a single batch.
-        model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`):
+        model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`):
             Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
             string.
-        dataset_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`):
+        dataset_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`):
             Dictionary of optional keyword arguments to pass when creating packed or non-packed datasets.
         eval_packing (`Optional[bool]`, *optional*, defaults to `None`):
             Whether to pack the eval dataset. If `None`, uses the same value as `packing`.
@@ -64,8 +64,8 @@ class SFTConfig(TrainingArguments):
     max_seq_length: Optional[int] = None
     dataset_num_proc: Optional[int] = None
     dataset_batch_size: int = 1000
-    model_init_kwargs: Optional[Dict[str, Any]] = None
-    dataset_kwargs: Optional[Dict[str, Any]] = None
+    model_init_kwargs: Optional[dict[str, Any]] = None
+    dataset_kwargs: Optional[dict[str, Any]] = None
     eval_packing: Optional[bool] = None
     num_of_sequences: int = 1024
     chars_per_token: float = 3.6
diff --git a/trl/trainer/sft_trainer.py b/trl/trainer/sft_trainer.py
index 56c4ad0991..dfc85b4dde 100644
--- a/trl/trainer/sft_trainer.py
+++ b/trl/trainer/sft_trainer.py
@@ -15,7 +15,7 @@
 import inspect
 import os
 import warnings
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Callable, Optional, Union
 
 import datasets
 import torch
@@ -80,7 +80,7 @@ class SFTTrainer(Trainer):
             The data collator to use for training.
         train_dataset (`Optional[datasets.Dataset]`):
             The dataset to use for training. We recommend users to use `trl.trainer.ConstantLengthDataset` to create their dataset.
-        eval_dataset (Optional[Union[`datasets.Dataset`, Dict[`str`, `datasets.Dataset`]]]):
+        eval_dataset (Optional[Union[`datasets.Dataset`, dict[`str`, `datasets.Dataset`]]]):
             The dataset to use for evaluation. We recommend users to use `trl.trainer.ConstantLengthDataset` to create their dataset.
         processing_class (`PreTrainedTokenizerBase` or `BaseImageProcessor` or `FeatureExtractionMixin` or `ProcessorMixin`, *optional*):
             Processing class used to process the data. If provided, will be used to automatically process the inputs
@@ -89,12 +89,12 @@ class SFTTrainer(Trainer):
             This supercedes the `tokenizer` argument, which is now deprecated.
         model_init (`Callable[[], transformers.PreTrainedModel]`):
             The model initializer to use for training. If None is specified, the default model initializer will be used.
-        compute_metrics (`Callable[[transformers.EvalPrediction], Dict]`, *optional* defaults to None):
+        compute_metrics (`Callable[[transformers.EvalPrediction], dict]`, *optional* defaults to None):
             The function used to compute metrics during evaluation. It should return a dictionary mapping metric names to metric values.
             If not specified, only the loss will be computed during evaluation.
-        callbacks (`List[transformers.TrainerCallback]`):
+        callbacks (`list[transformers.TrainerCallback]`):
             The callbacks to use for training.
-        optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
             The optimizer and scheduler to use for training.
         preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
             The function to use to preprocess the logits before computing the metrics.
@@ -115,14 +115,14 @@ def __init__(
         args: Optional[SFTConfig] = None,
         data_collator: Optional[DataCollator] = None,  # type: ignore
         train_dataset: Optional[Dataset] = None,
-        eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
+        eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None,
         processing_class: Optional[
             Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin]
         ] = None,
         model_init: Optional[Callable[[], PreTrainedModel]] = None,
-        compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
-        callbacks: Optional[List[TrainerCallback]] = None,
-        optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
+        compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None,
+        callbacks: Optional[list[TrainerCallback]] = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
         preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
         peft_config: Optional["PeftConfig"] = None,
         formatting_func: Optional[Callable] = None,
@@ -509,7 +509,7 @@ def create_model_card(
         self,
         model_name: Optional[str] = None,
         dataset_name: Optional[str] = None,
-        tags: Union[str, List[str], None] = None,
+        tags: Union[str, list[str], None] = None,
     ):
         """
         Creates a draft of a model card using the information available to the `Trainer`.
@@ -519,7 +519,7 @@ def create_model_card(
                 The name of the model.
             dataset_name (`str`, *optional*, defaults to `None`):
                 The name of the dataset used for training.
-            tags (`str`, `List[str]` or `None`, *optional*, defaults to `None`):
+            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                 Tags to be associated with the model card.
         """
         if not self.is_world_process_zero():
diff --git a/trl/trainer/utils.py b/trl/trainer/utils.py
index f2854a1cec..420e3abc78 100644
--- a/trl/trainer/utils.py
+++ b/trl/trainer/utils.py
@@ -19,7 +19,7 @@
 from collections import deque
 from dataclasses import dataclass
 from importlib.metadata import version
-from typing import Any, Dict, List, Literal, Optional, Tuple, Union
+from typing import Any, Literal, Optional, Union
 
 import datasets
 import numpy as np
@@ -91,10 +91,10 @@ class DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling):
     calculated on the completion made by the assistant.
 
     Args:
-        response_template (`Union[str, List[int]]`): the template form that indicates the start of the response, typically something like
+        response_template (`Union[str, list[int]]`): the template form that indicates the start of the response, typically something like
             '### Response:\n'. It can also be passed as tokenized ids, which can be useful when using a tokenizer that encodes the response
             differently if it does not have proper context.
-        instruction_template (`Union[str, List[int]]`): the template form that indicates the start of the human instruction, typically something like
+        instruction_template (`Union[str, list[int]]`): the template form that indicates the start of the human instruction, typically something like
             '### Human:\n'. Useful for assistant-style conversation datasets. It can also be passed as tokenized ids.
         mlm (`bool`, *optional*, defaults to `False`): Whether or not to use masked language modeling in the underlying
             `DataCollatorForLanguageModeling` class. Note that this option currently has no effect but is present
@@ -105,8 +105,8 @@ class DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling):
 
     def __init__(
         self,
-        response_template: Union[str, List[int]],
-        instruction_template: Optional[Union[str, List[int]]] = None,
+        response_template: Union[str, list[int]],
+        instruction_template: Optional[Union[str, list[int]]] = None,
         *args,
         mlm: bool = False,
         ignore_index: int = -100,
@@ -142,7 +142,7 @@ def __init__(
         self.ignore_index = ignore_index
         self.padding_free = padding_free
 
-    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
+    def torch_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
         batch = super().torch_call(examples)
 
         if self.instruction_template is None:
@@ -255,7 +255,7 @@ def __post_init__(self):
             # set a sensible default
             self.max_length = min(self.tokenizer.model_max_length, 1024)
 
-    def __call__(self, examples: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
+    def __call__(self, examples: list[dict[str, Any]]) -> dict[str, torch.Tensor]:
         input_ids = []
         attention_mask = []
         prompts_input_ids = []
@@ -350,7 +350,7 @@ class RewardDataCollatorWithPadding:
     pad_to_multiple_of: Optional[int] = None
     return_tensors: str = "pt"
 
-    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
+    def __call__(self, features: list[dict[str, Any]]) -> dict[str, Any]:
         features_chosen = []
         features_rejected = []
         margin = []
@@ -407,12 +407,12 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
         return batch
 
 
-def pad(tensors: List[torch.Tensor], padding_value: int = 0, padding_side: str = "right") -> torch.Tensor:
+def pad(tensors: list[torch.Tensor], padding_value: int = 0, padding_side: str = "right") -> torch.Tensor:
     """
     Pads a list of tensors to the same shape along the first dimension.
 
     Args:
-        tensors (`List[torch.Tensor]`):
+        tensors (`list[torch.Tensor]`):
             List of input tensors to pad.
         padding_value (`int`):
             Value to use for padding. Default is 0.
@@ -474,7 +474,7 @@ class DPODataCollatorWithPadding:
     label_pad_token_id: int = -100
     is_encoder_decoder: Optional[bool] = False
 
-    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
+    def __call__(self, features: list[dict[str, Any]]) -> dict[str, Any]:
         # first, pad everything to the same length
         padded_batch = {}
         for k in features[0].keys():
@@ -701,7 +701,7 @@ class RunningMoments:
     count: float = 1e-24
 
     @torch.no_grad()
-    def update(self, xs: torch.Tensor) -> Tuple[float, float]:
+    def update(self, xs: torch.Tensor) -> tuple[float, float]:
         """
         Updates running moments from batch's moments computed across ranks
         """
@@ -749,7 +749,7 @@ def load_from_json(cls, accelerator: Accelerator, json_path: str):
 @torch.no_grad()
 def get_global_statistics(
     accelerator, xs: torch.Tensor, mask=None, device="cpu"
-) -> Tuple[torch.Tensor, torch.Tensor, int]:
+) -> tuple[torch.Tensor, torch.Tensor, int]:
     """
     Computes element-wise mean and variance of the tensor across processes. Reference:
     https://github.com/OpenLMLab/MOSS-RLHF/blob/40b91eb2f2b71b16919addede0341d2bef70825d/utils.py#L57C1-L73C75
@@ -767,7 +767,7 @@ def get_global_statistics(
     return global_mean.to(device), global_var.to(device), count.item()
 
 
-def compute_accuracy(eval_pred) -> Dict[str, float]:
+def compute_accuracy(eval_pred) -> dict[str, float]:
     predictions, labels = eval_pred
     # Here, predictions is rewards_chosen and rewards_rejected.
     # We want to see how much of the time rewards_chosen > rewards_rejected.
@@ -898,7 +898,7 @@ def get_quantization_config(model_config: ModelConfig) -> Optional[BitsAndBytesC
     return quantization_config
 
 
-def get_kbit_device_map() -> Optional[Dict[str, int]]:
+def get_kbit_device_map() -> Optional[dict[str, int]]:
     if is_torch_xpu_available():
         return {"": f"xpu:{PartialState().local_process_index}"}
     elif torch.cuda.is_available():
@@ -1080,7 +1080,7 @@ def first_true_indices(bools: torch.Tensor, dtype=torch.long):
 
 def get_reward(
     model: torch.nn.Module, query_responses: torch.Tensor, pad_token_id: int, context_length: int
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     Computes the reward logits and the rewards for a given model and query responses.
 
@@ -1239,7 +1239,7 @@ def truncate_response(stop_token_id: int, pad_token_id: int, responses: torch.Te
 
 def generate(
     lm_backbone: torch.nn.Module, queries: torch.Tensor, pad_token_id: int, generation_config: GenerationConfig
-) -> Tuple[torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Generates sequences from the language model backbone in a way that does not affect padding tokens.
 
@@ -1312,11 +1312,11 @@ def batch_generation(
 def add_bos_token_if_needed(
     bos_token_id: Optional[int],
     prompt_len_input_ids: int,
-    prompt_tokens: Dict[str, List[int]],
+    prompt_tokens: dict[str, list[int]],
     chosen_prompt_len_input_ids: int,
-    chosen_tokens: Dict[str, List[int]],
+    chosen_tokens: dict[str, list[int]],
     rejected_prompt_len_input_ids: int,
-    rejected_tokens: Dict[str, List[int]],
+    rejected_tokens: dict[str, list[int]],
 ):
     if bos_token_id is not None:
         if prompt_len_input_ids == 0 or bos_token_id != prompt_tokens["prompt_input_ids"][0]:
@@ -1332,7 +1332,7 @@ def add_bos_token_if_needed(
 
 
 def add_eos_token_if_needed(
-    eos_token_id: int, chosen_tokens: Dict[str, List[int]], rejected_tokens: Dict[str, List[int]]
+    eos_token_id: int, chosen_tokens: dict[str, list[int]], rejected_tokens: dict[str, list[int]]
 ):
     if len(chosen_tokens["input_ids"]) == 0 or eos_token_id != chosen_tokens["input_ids"][-1]:
         chosen_tokens["input_ids"].append(eos_token_id)
@@ -1345,7 +1345,7 @@ def add_eos_token_if_needed(
 
 def truncate_right(
     input_ids: torch.Tensor, stop_token_id: int, pad_token_id: int
-) -> Tuple[torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Truncates the input tensor from the right side after the first occurrence of the stop token.
 
@@ -1390,7 +1390,7 @@ def empty_cache() -> None:
         torch.cuda.empty_cache()
 
 
-def decode_and_strip_padding(inputs: torch.Tensor, tokenizer: PreTrainedTokenizerBase) -> List[str]:
+def decode_and_strip_padding(inputs: torch.Tensor, tokenizer: PreTrainedTokenizerBase) -> list[str]:
     """
     Decodes the input tensor and strips the padding tokens.
 
@@ -1401,7 +1401,7 @@ def decode_and_strip_padding(inputs: torch.Tensor, tokenizer: PreTrainedTokenize
             The tokenizer used to decode the input tensor.
 
     Returns:
-        `List[str]`:
+        `list[str]`:
             The list of decoded strings with padding tokens stripped.
     """
     decoded = tokenizer.batch_decode(inputs, skip_special_tokens=False)
@@ -1413,7 +1413,7 @@ def generate_model_card(
     model_name: str,
     hub_model_id: str,
     dataset_name: Optional[str],
-    tags: List[str],
+    tags: list[str],
     wandb_url: Optional[str],
     trainer_name: str,
     trainer_citation: Optional[str] = None,
@@ -1432,7 +1432,7 @@ def generate_model_card(
             Hub model ID as `username/model_id`.
         dataset_name (`str` or `None`):
             Dataset name.
-        tags (`List[str]`):
+        tags (`list[str]`):
             Tags.
         wandb_url (`str` or `None`):
             Weights & Biases run URL.
diff --git a/trl/trainer/xpo_config.py b/trl/trainer/xpo_config.py
index bd5a46def7..ffeacbb961 100644
--- a/trl/trainer/xpo_config.py
+++ b/trl/trainer/xpo_config.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 from dataclasses import dataclass, field
-from typing import List
 
 from trl.trainer.online_dpo_config import OnlineDPOConfig
 
@@ -26,11 +25,11 @@ class XPOConfig(OnlineDPOConfig):
     Subclass of [`OnlineDPOConfig`] we can use all its arguments and add the following:
 
     Parameters:
-        alpha (`float` or `List[float]`, *optional*, defaults to `1e-5`):
+        alpha (`float` or `list[float]`, *optional*, defaults to `1e-5`):
             Weight of the XPO loss term. If a list of floats is provided then the alpha is selected for each new epoch and the last alpha is used for the rest of the epochs.
     """
 
-    alpha: List[float] = field(default_factory=lambda: [1e-5])
+    alpha: list[float] = field(default_factory=lambda: [1e-5])
 
     def __post_init__(self):
         super().__post_init__()
diff --git a/trl/trainer/xpo_trainer.py b/trl/trainer/xpo_trainer.py
index 4ec501c7f0..baa580d136 100644
--- a/trl/trainer/xpo_trainer.py
+++ b/trl/trainer/xpo_trainer.py
@@ -14,7 +14,7 @@
 
 import os
 import textwrap
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Optional, Union
 
 import jinja2
 import torch
@@ -77,14 +77,14 @@ class XPOTrainer(OnlineDPOTrainer):
             Processing class used to process the data. If provided, will be used to automatically process the inputs
             for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
             reuse the fine-tuned model.
-        peft_config (`Dict`):
+        peft_config (`dict`):
             The peft config to use for training.
-        compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*):
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
             The function to use to compute the metrics. Must take a `EvalPrediction` and return
             a dictionary string to metric values.
-        callbacks (`List[transformers.TrainerCallback]`):
+        callbacks (`list[transformers.TrainerCallback]`):
             The callbacks to use for training.
-        optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
             The optimizer and scheduler to use for training.
         preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
             The function to use to preprocess the logits before computing the metrics.
@@ -101,14 +101,14 @@ def __init__(
         args: Optional[XPOConfig] = None,
         data_collator: Optional[Callable] = None,
         train_dataset: Optional[Union[Dataset, IterableDataset]] = None,
-        eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
+        eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None,
         processing_class: Optional[
             Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin]
         ] = None,
-        peft_config: Optional[Dict] = None,
-        compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
-        callbacks: Optional[List[TrainerCallback]] = None,
-        optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
+        peft_config: Optional[dict] = None,
+        compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None,
+        callbacks: Optional[list[TrainerCallback]] = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
         preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
     ) -> None:
         super().__init__(
@@ -420,7 +420,7 @@ def gather_mean(tensor):
         self.stats["beta"].append(self.beta)
 
     def training_step(
-        self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]], num_items_in_batch: Optional[int] = None
+        self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], num_items_in_batch: Optional[int] = None
     ) -> torch.Tensor:
         model.train()
 
@@ -512,7 +512,7 @@ def create_model_card(
         self,
         model_name: Optional[str] = None,
         dataset_name: Optional[str] = None,
-        tags: Union[str, List[str], None] = None,
+        tags: Union[str, list[str], None] = None,
     ):
         """
         Creates a draft of a model card using the information available to the `Trainer`.
@@ -522,7 +522,7 @@ def create_model_card(
                 The name of the model.
             dataset_name (`str`, *optional*, defaults to `None`):
                 The name of the dataset used for training.
-            tags (`str`, `List[str]` or `None`, *optional*, defaults to `None`):
+            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                 Tags to be associated with the model card.
         """
         if not self.is_world_process_zero():