break(datasets) Drop support for Python 3.8 (#4213)
adam-narozniak authored Sep 16, 2024
1 parent d94b22d commit 13decda
Showing 34 changed files with 170 additions and 179 deletions.
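
The recurring change in the diffs below is the PEP 585 migration that dropping Python 3.8 enables: the deprecated `typing.List`, `typing.Dict`, `typing.Tuple`, and `typing.Set` aliases are replaced with the built-in `list`, `dict`, `tuple`, and `set`, which accept subscripts directly from Python 3.9 onward. A minimal sketch of the pattern (hypothetical function names, not from this commit):

    # Python 3.8 style: container generics must come from typing
    # from typing import Dict, List, Optional
    # def count_labels(labels: List[str]) -> Dict[str, int]: ...

    from typing import Optional  # Optional/Union still come from typing

    def count_labels(labels: list[str]) -> dict[str, int]:
        """Count occurrences of each label using built-in generics (PEP 585)."""
        counts: dict[str, int] = {}
        for label in labels:
            counts[label] = counts.get(label, 0) + 1
        return counts

    def first_label(labels: list[str]) -> Optional[str]:
        # PEP 604's "str | None" syntax would require Python 3.10
        # (or future-imported annotations), so Optional stays for now.
        return labels[0] if labels else None
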
2 changes: 1 addition & 1 deletion .github/workflows/datasets-e2e.yml
@@ -45,7 +45,7 @@ jobs:
       - name: Bootstrap
         uses: ./.github/actions/bootstrap
         with:
-          python-version: 3.8
+          python-version: 3.9
       - name: Install dependencies
         run: python -m poetry install
       - name: Run tests
2 changes: 1 addition & 1 deletion .github/workflows/datasets.yml
@@ -37,7 +37,7 @@ jobs:
         # In case of a mismatch, the job has to download Python to install it.
         # Note: Due to a bug in actions/setup-python, we have to put "3.10" in
         # quotes as it will otherwise assume "3.1"
-        python: [3.8, 3.9, '3.10', '3.11']
+        python: ['3.9', '3.10', '3.11']
 
     name: Python ${{ matrix.python }}
 
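
The comment in this hunk is about YAML scalar parsing: an unquoted 3.10 is read as the float 3.1, so only quoting preserves the intended version string. A quick way to see the effect (assumes the third-party PyYAML package, which is not part of this diff):

    import yaml  # pip install pyyaml

    # Quoted entries survive as strings.
    print(yaml.safe_load("python: [3.8, 3.9, '3.10', '3.11']"))
    # {'python': [3.8, 3.9, '3.10', '3.11']}

    # Unquoted 3.10 is parsed as a float and silently truncated to 3.1.
    print(yaml.safe_load("python: [3.10, 3.11]"))
    # {'python': [3.1, 3.11]}
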
10 changes: 5 additions & 5 deletions datasets/flwr_datasets/common/telemetry.py
@@ -25,7 +25,7 @@
 from concurrent.futures import Future, ThreadPoolExecutor
 from enum import Enum, auto
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union, cast
+from typing import Any, Optional, Union, cast
 
 from flwr_datasets.common.version import package_name, package_version
 
@@ -114,7 +114,7 @@ class EventType(str, Enum):
     # The type signature is not compatible with mypy, pylint and flake8
     # so each of those needs to be disabled for this line.
     # pylint: disable-next=no-self-argument,arguments-differ,line-too-long
-    def _generate_next_value_(name: str, start: int, count: int, last_values: List[Any]) -> Any:  # type: ignore # noqa: E501
+    def _generate_next_value_(name: str, start: int, count: int, last_values: list[Any]) -> Any:  # type: ignore # noqa: E501
         return name
 
     PING = auto()
@@ -127,7 +127,7 @@ def _generate_next_value_(name: str, start: int, count: int, last_values: List[Any]) -> Any:
 
 # Use the ThreadPoolExecutor with max_workers=1 to have a queue
 # and also ensure that telemetry calls are not blocking.
-state: Dict[str, Union[Optional[str], Optional[ThreadPoolExecutor]]] = {
+state: dict[str, Union[Optional[str], Optional[ThreadPoolExecutor]]] = {
     # Will be assigned ThreadPoolExecutor(max_workers=1)
     # in event() the first time it's required
     "executor": None,
@@ -143,7 +143,7 @@ def _generate_next_value_(name: str, start: int, count: int, last_values: List[Any]) -> Any:
 # pylint: disable-next=unsubscriptable-object
 def event(
     event_type: EventType,
-    event_details: Optional[Dict[str, Any]] = None,
+    event_details: Optional[dict[str, Any]] = None,
 ) -> Future:  # type: ignore
     """Submit create_event to ThreadPoolExecutor to avoid blocking."""
     if state["executor"] is None:
@@ -155,7 +155,7 @@ def event(
     return result
 
 
-def create_event(event_type: EventType, event_details: Optional[Dict[str, Any]]) -> str:
+def create_event(event_type: EventType, event_details: Optional[dict[str, Any]]) -> str:
     """Create telemetry event."""
     if state["source"] is None:
         state["source"] = _get_source_id()
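
The state dict and event() above implement a fire-and-forget telemetry queue: a single-worker ThreadPoolExecutor serializes events while keeping callers non-blocking. A stripped-down sketch of the same idea (simplified; create_event here is a stand-in, not Flower's implementation):

    from concurrent.futures import Future, ThreadPoolExecutor
    from typing import Any, Optional

    _executor: Optional[ThreadPoolExecutor] = None

    def create_event(name: str, details: Optional[dict[str, Any]]) -> str:
        # Stand-in for the real payload construction and network send.
        return f"{name}: {details or {}}"

    def event(name: str, details: Optional[dict[str, Any]] = None) -> Future:
        global _executor
        if _executor is None:
            # max_workers=1 gives an implicit FIFO queue of pending events.
            _executor = ThreadPoolExecutor(max_workers=1)
        # submit() returns immediately; the caller is never blocked.
        return _executor.submit(create_event, name, details)

    future = event("PING")
    print(future.result())  # blocks only if you explicitly ask for the result
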
4 changes: 2 additions & 2 deletions datasets/flwr_datasets/common/typing.py
@@ -15,12 +15,12 @@
 """Flower Datasets type definitions."""
 
 
-from typing import Any, List
+from typing import Any
 
 import numpy as np
 import numpy.typing as npt
 
 NDArray = npt.NDArray[Any]
 NDArrayInt = npt.NDArray[np.int_]
 NDArrayFloat = npt.NDArray[np.float_]
-NDArrays = List[NDArray]
+NDArrays = list[NDArray]
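
With the typing import gone, the NDArrays alias subscripts the built-in list at runtime, which is exactly what Python 3.9 makes legal. Usage of such aliases might look like this (a hedged sketch, not code from the repository):

    from typing import Any

    import numpy as np
    import numpy.typing as npt

    NDArray = npt.NDArray[Any]
    NDArrays = list[NDArray]  # valid at runtime only on Python 3.9+

    def split_in_half(array: NDArray) -> NDArrays:
        """Return the two halves of an array as a list of arrays."""
        mid = array.shape[0] // 2
        return [array[:mid], array[mid:]]

    print(split_in_half(np.arange(6)))  # [array([0, 1, 2]), array([3, 4, 5])]
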
5 changes: 2 additions & 3 deletions datasets/flwr_datasets/common/version.py
@@ -19,15 +19,14 @@
 
 
 import importlib.metadata as importlib_metadata
-from typing import Tuple
 
 
-def _check_package(name: str) -> Tuple[str, str]:
+def _check_package(name: str) -> tuple[str, str]:
     version: str = importlib_metadata.version(name)
     return name, version
 
 
-def _version() -> Tuple[str, str]:
+def _version() -> tuple[str, str]:
     """Read and return Flower Dataset package name and version.
 
     Returns
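
_check_package returns a (name, version) pair straight from the installed package metadata; only its annotation changes here. The mechanism can be tried directly (the package name below is an arbitrary example):

    import importlib.metadata as importlib_metadata

    def check_package(name: str) -> tuple[str, str]:
        # importlib.metadata.version raises PackageNotFoundError
        # if the distribution is not installed.
        version: str = importlib_metadata.version(name)
        return name, version

    print(check_package("numpy"))  # e.g. ('numpy', '1.26.4')
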
8 changes: 4 additions & 4 deletions datasets/flwr_datasets/federated_dataset_test.py
@@ -17,7 +17,7 @@
 
 
 import unittest
-from typing import Dict, Union
+from typing import Union
 from unittest.mock import Mock, patch
 
 import numpy as np
@@ -385,7 +385,7 @@ def test_dict_of_partitioners_passes_partitioners(self) -> None:
         """Test if partitioners are passed directly (no recreation)."""
         num_train_partitions = 100
         num_test_partitions = 100
-        partitioners: Dict[str, Union[Partitioner, int]] = {
+        partitioners: dict[str, Union[Partitioner, int]] = {
             "train": IidPartitioner(num_partitions=num_train_partitions),
             "test": IidPartitioner(num_partitions=num_test_partitions),
         }
@@ -419,7 +419,7 @@ def test_mixed_type_partitioners_passes_instantiated_partitioners(self) -> None:
         """Test if an instantiated partitioner is passed directly."""
         num_train_partitions = 100
         num_test_partitions = 100
-        partitioners: Dict[str, Union[Partitioner, int]] = {
+        partitioners: dict[str, Union[Partitioner, int]] = {
             "train": IidPartitioner(num_partitions=num_train_partitions),
             "test": num_test_partitions,
         }
@@ -433,7 +433,7 @@ def test_mixed_type_partitioners_creates_from_int(self) -> None:
         """Test if an IidPartitioner partitioner is created."""
         num_train_partitions = 100
         num_test_partitions = 100
-        partitioners: Dict[str, Union[Partitioner, int]] = {
+        partitioners: dict[str, Union[Partitioner, int]] = {
             "train": IidPartitioner(num_partitions=num_train_partitions),
             "test": num_test_partitions,
         }
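
These tests exercise a convenience of the partitioners argument: each split maps either to an already-instantiated Partitioner or to a bare int, and ints are turned into IidPartitioner instances internally. A sketch of that normalization under the same convention (illustrative only, not the FederatedDataset source):

    from typing import Union

    from flwr_datasets.partitioner import IidPartitioner, Partitioner

    def instantiate_partitioners(
        partitioners: dict[str, Union[Partitioner, int]]
    ) -> dict[str, Partitioner]:
        """Replace bare ints with IidPartitioner(num_partitions=int)."""
        instantiated: dict[str, Partitioner] = {}
        for split, partitioner in partitioners.items():
            if isinstance(partitioner, int):
                partitioner = IidPartitioner(num_partitions=partitioner)
            instantiated[split] = partitioner
        return instantiated

    partitioners = instantiate_partitioners(
        {"train": 100, "test": IidPartitioner(num_partitions=10)}
    )
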
6 changes: 3 additions & 3 deletions datasets/flwr_datasets/metrics/utils.py
@@ -16,7 +16,7 @@
 
 
 import warnings
-from typing import List, Optional, Union
+from typing import Optional, Union
 
 import pandas as pd
 
@@ -206,7 +206,7 @@ def compute_frequencies(
 
 
 def _compute_counts(
-    labels: Union[List[int], List[str]], unique_labels: Union[List[int], List[str]]
+    labels: Union[list[int], list[str]], unique_labels: Union[list[int], list[str]]
 ) -> pd.Series:
     """Compute the count of labels when taking into account all possible labels.
@@ -237,7 +237,7 @@ def _compute_counts(
 
 
 def _compute_frequencies(
-    labels: Union[List[int], List[str]], unique_labels: Union[List[int], List[str]]
+    labels: Union[list[int], list[str]], unique_labels: Union[list[int], list[str]]
 ) -> pd.Series:
     """Compute the distribution of labels when taking into account all possible labels.
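
Per its docstring, _compute_counts pads the label counts with zeros for labels that never occur, so every partition reports over the same index of possible labels. One way to get that behavior with pandas (an approximation of the described semantics, not the repository's exact code):

    from typing import Union

    import pandas as pd

    def compute_counts(
        labels: Union[list[int], list[str]],
        unique_labels: Union[list[int], list[str]],
    ) -> pd.Series:
        """Count labels, including zero counts for absent unique labels."""
        counts = pd.Series(labels).value_counts()
        # reindex inserts missing labels with NaN; fill_value keeps them at 0.
        return counts.reindex(unique_labels, fill_value=0)

    print(compute_counts(["cat", "cat", "dog"], ["cat", "dog", "bird"]))
    # cat     2
    # dog     1
    # bird    0
    # dtype: int64
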
38 changes: 19 additions & 19 deletions datasets/flwr_datasets/mock_utils_test.py
@@ -19,7 +19,7 @@
 import random
 import string
 from datetime import datetime, timedelta
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
+from typing import Any, Optional, Union
 
 import numpy as np
 from PIL import Image
@@ -30,7 +30,7 @@
 
 def _generate_artificial_strings(
     num_rows: int, num_unique: int, string_length: int, seed: int = 42
-) -> List[str]:
+) -> list[str]:
     """Create list of strings for categories or labels mocking.
 
     Note to keep the seed the same if you reuse this function for in creation of the
@@ -53,7 +53,7 @@ def _generate_artificial_strings(
         List of generated strings.
     """
     random.seed(seed)
-    unique_strings: Set[str] = set()
+    unique_strings: set[str] = set()
     while len(unique_strings) < num_unique:
         random_str = "".join(
             random.choices(string.ascii_letters + string.digits, k=string_length)
@@ -68,7 +68,7 @@ def _generate_artificial_strings(
     return artificial_column
 
 
-def _generate_artificial_categories(num_rows: int, choices: List[Any]) -> List[str]:
+def _generate_artificial_categories(num_rows: int, choices: list[Any]) -> list[str]:
     """Create list of strings from given `choices` list."""
     artificial_column = choices.copy()
     remaining_to_allocate = num_rows - len(choices)
@@ -82,7 +82,7 @@ def _generate_random_word(length: int) -> str:
     return "".join(random.choices(string.ascii_letters, k=length))
 
 
-def _generate_random_text_column(num_rows: int, length: int) -> List[str]:
+def _generate_random_text_column(num_rows: int, length: int) -> list[str]:
     """Generate a list of random text of specified length."""
     text_col = []
     for _ in range(num_rows):
@@ -98,7 +98,7 @@ def _generate_random_sentence(
 ) -> str:
     """Generate a random sentence with words of random lengths."""
     sentence_length = random.randint(min_sentence_length, max_sentence_length)
-    sentence: List[str] = []
+    sentence: list[str] = []
     while len(" ".join(sentence)) < sentence_length:
         word_length = random.randint(min_word_length, max_word_length)
         word = _generate_random_word(word_length)
@@ -112,7 +112,7 @@ def _generate_random_sentences(
     max_word_length: int,
     min_sentence_length: int,
     max_sentence_length: int,
-) -> List[str]:
+) -> list[str]:
     """Generate a list of random sentences."""
     text_col = [
         _generate_random_sentence(
@@ -123,7 +123,7 @@ def _generate_random_sentences(
     return text_col
 
 
-def _make_num_rows_none(column: List[Any], num_none: int) -> List[Any]:
+def _make_num_rows_none(column: list[Any], num_none: int) -> list[Any]:
     """Assign none num_none times to the given list."""
     column_copy = column.copy()
     none_positions = random.sample(range(len(column_copy)), num_none)
@@ -154,29 +154,29 @@ def _generate_random_date_column(
     end_date: datetime,
     date_format: str = "%a %b %d %H:%M:%S %Y",
     as_string: bool = True,
-) -> List[Union[str, datetime]]:
+) -> list[Union[str, datetime]]:
     """Generate a list of random dates."""
     return [
         _generate_random_date(start_date, end_date, date_format, as_string)
         for _ in range(num_rows)
     ]
 
 
-def _generate_random_int_column(num_rows: int, min_int: int, max_int: int) -> List[int]:
+def _generate_random_int_column(num_rows: int, min_int: int, max_int: int) -> list[int]:
     """Generate a list of ints."""
     return [random.randint(min_int, max_int) for _ in range(num_rows)]
 
 
-def _generate_random_bool_column(num_rows: int) -> List[bool]:
+def _generate_random_bool_column(num_rows: int) -> list[bool]:
     """Generate a list of bools."""
     return [random.choice([True, False]) for _ in range(num_rows)]
 
 
 def _generate_random_image_column(
     num_rows: int,
-    image_size: Union[Tuple[int, int], Tuple[int, int, int]],
+    image_size: Union[tuple[int, int], tuple[int, int, int]],
     simulate_type: str,
-) -> List[Any]:
+) -> list[Any]:
     """Simulate the images with the format that is found in HF Hub.
 
     Directly using `Image.fromarray` does not work because it creates `PIL.Image.Image`.
@@ -207,7 +207,7 @@ def generate_random_audio_column(
     num_rows: int,
     sampling_rate: int,
     length_in_samples: int,
-) -> List[Dict[str, Any]]:
+) -> list[dict[str, Any]]:
     """Simulate the audio column.
 
     Audio column in the datset is comprised from an array or floats, sample_rate and a
@@ -365,8 +365,8 @@ def _mock_speach_commands(num_rows: int) -> Dataset:
 
 def _load_mocked_dataset(
     dataset_name: str,
-    num_rows: List[int],
-    split_names: List[str],
+    num_rows: list[int],
+    split_names: list[str],
     subset: str = "",
 ) -> DatasetDict:
     dataset_dict = {}
@@ -380,7 +380,7 @@ def _load_mocked_dataset_by_partial_download(
 def _load_mocked_dataset_by_partial_download(
     dataset_name: str,
     split_name: str,
-    skip_take_list: List[Tuple[int, int]],
+    skip_take_list: list[tuple[int, int]],
     subset_name: Optional[str] = None,
 ) -> Dataset:
     """Download a partial dataset.
@@ -423,8 +423,8 @@ def _load_mocked_dataset_by_partial_download(
 
 def _load_mocked_dataset_dict_by_partial_download(
     dataset_name: str,
-    split_names: List[str],
-    skip_take_lists: List[List[Tuple[int, int]]],
+    split_names: list[str],
+    skip_take_lists: list[list[tuple[int, int]]],
     subset_name: Optional[str] = None,
 ) -> DatasetDict:
     """Like _load_mocked_dataset_by_partial_download but for many splits."""
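
The image-mocking docstring above makes a subtle point: datasets decoded from the HF Hub yield concrete subclasses such as PIL.PngImagePlugin.PngImageFile, not the plain PIL.Image.Image that Image.fromarray produces, so a faithful mock has to round-trip through an encoded file. A sketch of that trick (an assumption about the intended mechanism, inferred from the docstring, not the file's actual code):

    import io

    import numpy as np
    from PIL import Image

    def mock_hub_image(image_size: tuple[int, int, int] = (32, 32, 3)) -> Image.Image:
        """Create a random image whose type matches a Hub-decoded PNG."""
        array = np.random.randint(0, 256, size=image_size, dtype=np.uint8)
        buffer = io.BytesIO()
        # Encode to PNG in memory, then reopen so PIL returns the
        # format-specific subclass instead of a plain Image.Image.
        Image.fromarray(array).save(buffer, format="PNG")
        buffer.seek(0)
        return Image.open(buffer)

    print(type(mock_hub_image()))  # <class 'PIL.PngImagePlugin.PngImageFile'>
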
14 changes: 7 additions & 7 deletions datasets/flwr_datasets/partitioner/dirichlet_partitioner.py
@@ -16,7 +16,7 @@
 
 
 import warnings
-from typing import Dict, List, Optional, Union
+from typing import Optional, Union
 
 import numpy as np
 
@@ -89,7 +89,7 @@ def __init__(  # pylint: disable=R0913
         self,
         num_partitions: int,
         partition_by: str,
-        alpha: Union[int, float, List[float], NDArrayFloat],
+        alpha: Union[int, float, list[float], NDArrayFloat],
         min_partition_size: int = 10,
         self_balancing: bool = False,
         shuffle: bool = True,
@@ -110,8 +110,8 @@ def __init__(  # pylint: disable=R0913
         # Utility attributes
         # The attributes below are determined during the first call to load_partition
         self._avg_num_of_samples_per_partition: Optional[float] = None
-        self._unique_classes: Optional[Union[List[int], List[str]]] = None
-        self._partition_id_to_indices: Dict[int, List[int]] = {}
+        self._unique_classes: Optional[Union[list[int], list[str]]] = None
+        self._partition_id_to_indices: dict[int, list[int]] = {}
         self._partition_id_to_indices_determined = False
 
     def load_partition(self, partition_id: int) -> datasets.Dataset:
@@ -142,7 +142,7 @@ def num_partitions(self) -> int:
         return self._num_partitions
 
     def _initialize_alpha(
-        self, alpha: Union[int, float, List[float], NDArrayFloat]
+        self, alpha: Union[int, float, list[float], NDArrayFloat]
     ) -> NDArrayFloat:
         """Convert alpha to the used format in the code a NDArrayFloat.
@@ -164,7 +164,7 @@ def _initialize_alpha(
             alpha = np.array([float(alpha)], dtype=float).repeat(self._num_partitions)
         elif isinstance(alpha, float):
             alpha = np.array([alpha], dtype=float).repeat(self._num_partitions)
-        elif isinstance(alpha, List):
+        elif isinstance(alpha, list):
            if len(alpha) != self._num_partitions:
                raise ValueError(
                    "If passing alpha as a List, it needs to be of length of equal to "
@@ -217,7 +217,7 @@ def _determine_partition_id_to_indices_if_needed(
         sampling_try = 0
         while True:
             # Prepare data structure to store indices assigned to partition ids
-            partition_id_to_indices: Dict[int, List[int]] = {}
+            partition_id_to_indices: dict[int, list[int]] = {}
             for nid in range(self._num_partitions):
                 partition_id_to_indices[nid] = []
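
One change in this file goes beyond annotations: the isinstance check against typing.List becomes a check against the built-in list. The built-in is the idiomatic form, and subscripted aliases such as list[float] are rejected by isinstance altogether. The surrounding _initialize_alpha logic normalizes every accepted alpha form to one concentration value per partition; a condensed sketch of that behavior (simplified from the hunks above, not the class itself):

    from typing import Union

    import numpy as np
    import numpy.typing as npt

    NDArrayFloat = npt.NDArray[np.float64]

    def initialize_alpha(
        alpha: Union[int, float, list[float], NDArrayFloat], num_partitions: int
    ) -> NDArrayFloat:
        """Normalize alpha to an array with one value per partition."""
        if isinstance(alpha, (int, float)):
            return np.array([float(alpha)], dtype=float).repeat(num_partitions)
        if isinstance(alpha, list):  # built-ins work with isinstance; list[float] would not
            if len(alpha) != num_partitions:
                raise ValueError("alpha list must have one entry per partition")
            return np.asarray(alpha, dtype=float)
        return np.asarray(alpha, dtype=float)  # already an ndarray

    print(initialize_alpha(0.5, 4))  # [0.5 0.5 0.5 0.5]
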
datasets/flwr_datasets/partitioner/dirichlet_partitioner_test.py
@@ -17,7 +17,7 @@
 
 # pylint: disable=W0212
 import unittest
-from typing import Tuple, Union
+from typing import Union
 
 import numpy as np
 from numpy.typing import NDArray
@@ -33,7 +33,7 @@ def _dummy_setup(
     num_rows: int,
     partition_by: str,
     self_balancing: bool = True,
-) -> Tuple[Dataset, DirichletPartitioner]:
+) -> tuple[Dataset, DirichletPartitioner]:
     """Create a dummy dataset and partitioner for testing."""
     data = {
         partition_by: [i % 3 for i in range(num_rows)],
@@ -16,7 +16,7 @@
 
 
 from collections import Counter
-from typing import Dict, List, Optional, Union
+from typing import Optional, Union
 
 import numpy as np
 
@@ -182,7 +182,7 @@ def __init__(  # pylint: disable=R0913
         self._num_unique_labels: int = 0
         self._num_columns: int = 0
         self._partition_id_to_indices_determined = False
-        self._partition_id_to_indices: Dict[int, List[int]] = {}
+        self._partition_id_to_indices: dict[int, list[int]] = {}
 
     def load_partition(self, partition_id: int) -> datasets.Dataset:
         """Load a partition based on the partition index.