switch to ruff #2156

Merged · 4 commits · Apr 30, 2024
58 changes: 0 additions & 58 deletions .github/workflows/autoblack.yml

This file was deleted.

58 changes: 23 additions & 35 deletions .github/workflows/lint.yml
@@ -1,44 +1,32 @@
-name: Lint
-on: [pull_request]
-
-env:
-  PYTHON_VERSION: "3.12.1"
+name: autoruff
+on:
+  pull_request:
+    branches:
+      - master
+      - "**dev"
+    paths:
+      - "**/*.py"
+      - "pyproject.toml"
 
 jobs:
-  build:
+  lint:
     runs-on: ubuntu-latest
 
-    name: Lint with Ruff using Python 3.12.1
     steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python ${{ env.PYTHON_VERSION }}
-        uses: actions/setup-python@v4
-        with:
-          python-version: ${{ env.PYTHON_VERSION }}
+      - uses: actions/checkout@v4
+
-      - name: Load cached Poetry installation
-        uses: actions/cache@v2
-        with:
-          path: ~/.local # the path depends on the OS
-          key: poetry-0 # increment to reset cache
-      - name: Install Poetry
-        uses: snok/install-poetry@v1
-        with:
-          version: '1.7.0'
-          virtualenvs-create: true
-          virtualenvs-in-project: true
-          installer-parallel: true
+      - name: Install poetry using pipx
+        run: pipx install poetry && pipx ensurepath
 
-      - name: Load cached venv
-        id: cached-poetry-dependencies
-        uses: actions/cache@v2
+      - uses: actions/setup-python@v5
         with:
-          path: .venv
-          key: venv-lint-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-00
-      - name: Install linting dependencies
-        if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
-        run: poetry install --no-interaction --no-root --only linting
+          python-version: "3.12.1"
+          cache: "poetry"
 
-      - name: Lint Python files with ruff
+      - name: Install dependencies
         run: |
-          source .venv/bin/activate
-          ruff --show-source .
+          poetry config virtualenvs.in-project true
+          poetry install --no-interaction --no-root --only linting
+
+      - name: Run Ruff formatting
+        run: poetry run ruff check --output-format=full
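To reproduce the new lint job locally, here is a minimal sketch using Python's subprocess module (assuming poetry is on PATH and the working directory is the repository root):

import subprocess

# Mirror the workflow's three commands, failing fast like CI (check=True).
for cmd in (
    ["poetry", "config", "virtualenvs.in-project", "true"],
    ["poetry", "install", "--no-interaction", "--no-root", "--only", "linting"],
    ["poetry", "run", "ruff", "check", "--output-format=full"],
):
    subprocess.run(cmd, check=True)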
1 change: 1 addition & 0 deletions docs/hooks/__init__.py
@@ -77,6 +77,7 @@ def re_route_links(markdown: str, page_title: str) -> str | None:
 
 # hooks for use by mkdocs
 
+
 # priority last - run this after any other such hooks
 # this ensures we are overwriting mknotebooks config,
 # not the other way round
723 changes: 347 additions & 376 deletions poetry.lock

Large diffs are not rendered by default.

7 changes: 3 additions & 4 deletions pyproject.toml
@@ -42,8 +42,7 @@ igraph = ">=0.11.2"
 
 [tool.poetry.group.linting]
 [tool.poetry.group.linting.dependencies]
-black = "22.6.0"
-ruff = "0.0.257"
+ruff = "^0.4.2"
 
 [tool.poetry.group.testing]
 [tool.poetry.group.testing.dependencies]
@@ -73,7 +72,7 @@ profile = "black"
 
 [tool.ruff]
 line-length = 88
-select = [
+lint.select = [
     # Pyflakes
     "F",
     # Pycodestyle
@@ -86,7 +85,7 @@ select = [
     # flake8-print
     "T20"
 ]
-ignore = [
+lint.ignore = [
     "B905", # `zip()` without an explicit `strict=` parameter
     "B006", # Do not use mutable data structures for argument defaults"
 ]
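As a rough illustration of the selected rule families (the codes below are ruff's standard ones; the file itself is hypothetical), ruff check would flag each commented line, while B905 and B006 stay suppressed via lint.ignore:

import os  # F401 (Pyflakes "F"): `os` imported but unused


def summarise(values):
    print(values)  # T201 (flake8-print "T20"): `print` found
    try:
        return sum(values)
    except:  # E722 (pycodestyle "E"): do not use bare `except`
        return None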
2 changes: 1 addition & 1 deletion splink/blocking_rule_creator_utils.py
@@ -8,7 +8,7 @@
 
 
 def to_blocking_rule_creator(
-    blocking_rule_creator: Union[dict[str, Any], str, BlockingRuleCreator]
+    blocking_rule_creator: Union[dict[str, Any], str, BlockingRuleCreator],
 ):
     if isinstance(blocking_rule_creator, dict):
        return CustomRule(**blocking_rule_creator)
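The only change here is formatter style: when a parameter list stays split across lines, ruff format appends a trailing comma after the last parameter, where black 22.6.0 left none. A minimal sketch with a hypothetical function (the same one-character change appears in splink/column_expression.py and splink/comparison_level_composition.py below):

# before:
# def normalise(
#     raw_rule: str
# ) -> str: ...

# after: trailing comma added, keeping the list exploded
def normalise(
    raw_rule: str,
) -> str:
    return raw_rule.strip()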
40 changes: 20 additions & 20 deletions splink/charts.py
@@ -268,7 +268,7 @@ def accuracy_chart(records, width=400, height=400, as_dict=False, add_metrics=[]
         "f2": "F2",
         "f0_5": "F0.5",
         "p4": "P4",
-        "phi": "\u03C6 (MCC)",
+        "phi": "\u03c6 (MCC)",
     }
     chart["transform"][2]["calculate"] = chart["transform"][2]["calculate"].replace(
         "__mapping__", str(mapping)
@@ -313,7 +313,7 @@ def threshold_selection_tool(records, as_dict=False, add_metrics=[]):
         "f2": "F2",
         "f0_5": "F0.5",
         "p4": "P4",
-        "phi": "\u03C6 (MCC)",
+        "phi": "\u03c6 (MCC)",
     }
     chart["hconcat"][1]["transform"][2]["calculate"] = chart["hconcat"][1]["transform"][
         2
@@ -380,32 +380,32 @@ def unlinkables_chart(
     unlinkables_chart_def["data"]["values"] = records
 
     if x_col == "match_probability":
-        unlinkables_chart_def["layer"][0]["encoding"]["x"][
-            "field"
-        ] = "match_probability"
-        unlinkables_chart_def["layer"][0]["encoding"]["x"]["axis"][
-            "title"
-        ] = "Threshold match probability"
+        unlinkables_chart_def["layer"][0]["encoding"]["x"]["field"] = (
+            "match_probability"
+        )
+        unlinkables_chart_def["layer"][0]["encoding"]["x"]["axis"]["title"] = (
+            "Threshold match probability"
+        )
         unlinkables_chart_def["layer"][0]["encoding"]["x"]["axis"]["format"] = ".2"
 
-        unlinkables_chart_def["layer"][1]["encoding"]["x"][
-            "field"
-        ] = "match_probability"
+        unlinkables_chart_def["layer"][1]["encoding"]["x"]["field"] = (
+            "match_probability"
+        )
         unlinkables_chart_def["layer"][1]["selection"]["selector112"]["fields"] = [
             "match_probability",
             "cum_prop",
         ]
 
-        unlinkables_chart_def["layer"][2]["encoding"]["x"][
-            "field"
-        ] = "match_probability"
-        unlinkables_chart_def["layer"][2]["encoding"]["x"]["axis"][
-            "title"
-        ] = "Threshold match probability"
+        unlinkables_chart_def["layer"][2]["encoding"]["x"]["field"] = (
+            "match_probability"
+        )
+        unlinkables_chart_def["layer"][2]["encoding"]["x"]["axis"]["title"] = (
+            "Threshold match probability"
+        )
 
-        unlinkables_chart_def["layer"][3]["encoding"]["x"][
-            "field"
-        ] = "match_probability"
+        unlinkables_chart_def["layer"][3]["encoding"]["x"]["field"] = (
+            "match_probability"
+        )
 
     if source_dataset:
         unlinkables_chart_def["title"]["text"] += f" - {source_dataset}"
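All of the churn in this file comes from two formatter-style shifts: the hex digits in unicode escapes are lowercased (\u03C6 becomes \u03c6), and long assignments are wrapped by parenthesising the right-hand side rather than breaking inside the subscripted target. A sketch of the wrapping change on a hypothetical chart spec (the same pattern recurs in splink/em_training_session.py and splink/expectation_maximisation.py below):

chart = {"encoding": {"x": {}}}  # hypothetical stand-in for a chart spec

# old (black 22.x) line-wrapping broke inside the subscript:
chart["encoding"]["x"][
    "field"
] = "match_probability"

# new (ruff format) keeps the target intact and parenthesises the value:
chart["encoding"]["x"]["field"] = (
    "match_probability"
)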
2 changes: 1 addition & 1 deletion splink/column_expression.py
@@ -51,7 +51,7 @@ def _clone(self) -> "ColumnExpression":
 
     @staticmethod
     def instantiate_if_str(
-        str_or_column_expression: Union[str, "ColumnExpression"]
+        str_or_column_expression: Union[str, "ColumnExpression"],
     ) -> "ColumnExpression":
         if isinstance(str_or_column_expression, ColumnExpression):
             return str_or_column_expression
1 change: 0 additions & 1 deletion splink/comparison.py
@@ -62,7 +62,6 @@ def __init__(
         comparison_description: str = None,
         column_info_settings: ColumnInfoSettings = None,
     ):
-
         self.comparison_levels: list[ComparisonLevel] = comparison_levels
 
         self._column_info_settings: Optional[ColumnInfoSettings] = column_info_settings
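This deletion, repeated in several files below, is another formatter-style change: the blank line that black 22.6.0 tolerated at the top of a function body is now stripped. A minimal before/after sketch with a hypothetical function:

# before: a blank line directly after the signature survived formatting
# def total(values):
#
#     return sum(values)

# after: the leading blank line is removed
def total(values):
    return sum(values)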
1 change: 0 additions & 1 deletion splink/comparison_level.py
@@ -494,7 +494,6 @@ def _exact_match_colnames(self):
     def _u_probability_corresponding_to_exact_match(
         self, comparison_levels: list[ComparisonLevel]
     ):
-
         if self.disable_tf_exact_match_detection:
             return self.u_probability
 
2 changes: 1 addition & 1 deletion splink/comparison_level_composition.py
@@ -9,7 +9,7 @@
 
 
 def _ensure_is_comparison_level_creator(
-    cl: Union[ComparisonLevelCreator, dict[str, Any]]
+    cl: Union[ComparisonLevelCreator, dict[str, Any]],
 ) -> ComparisonLevelCreator:
     if isinstance(cl, dict):
         from .comparison_level_library import CustomLevel
1 change: 0 additions & 1 deletion splink/comparison_level_library.py
@@ -584,7 +584,6 @@ def __init__(
         self.input_is_string = input_is_string
 
     def convert_time_metric_to_seconds(self, threshold: float, metric: str) -> float:
-
         conversion_factors = {
             "second": 1,
             "minute": 60,
3 changes: 0 additions & 3 deletions splink/comparison_template_library.py
@@ -54,7 +54,6 @@ def __init__(
         separate_1st_january: bool = False,
         use_damerau_levenshtein: bool = True,
     ):
-
         date_thresholds_as_iterable = ensure_is_iterable(datetime_thresholds)
         self.datetime_thresholds = [*date_thresholds_as_iterable]
         date_metrics_as_iterable = ensure_is_iterable(datetime_metrics)
@@ -88,7+87,6 @@ def datetime_parse_function(self):
         return self.col_expression.try_parse_date
 
     def create_comparison_levels(self) -> List[ComparisonLevelCreator]:
-
         if self.invalid_dates_as_null:
             null_col = self.datetime_parse_function(self.datetime_format)
         else:
@@ -153,7 +151,6 @@ def create_comparison_levels(self) -> List[ComparisonLevelCreator]:
         return levels
 
     def create_description(self) -> str:
-
         comparison_desc = "Exact match "
         if self.separate_1st_january:
             comparison_desc += "(with separate 1st Jan) "
1 change: 0 additions & 1 deletion splink/database_api.py
@@ -275,7 +275,6 @@ def register_multiple_tables(
     def register_table(
         self, input_table, table_name, overwrite=False
     ) -> SplinkDataFrame:
-
         tables_dict = self.register_multiple_tables(
             [input_table], [table_name], overwrite=overwrite
         )
12 changes: 6 additions & 6 deletions splink/em_training_session.py
@@ -115,9 +115,9 @@ def __init__(
             cc_names_to_deactivate = [
                 cc.output_column_name for cc in comparisons_to_deactivate
             ]
-            self._comparisons_that_cannot_be_estimated: list[
-                Comparison
-            ] = comparisons_to_deactivate
+            self._comparisons_that_cannot_be_estimated: list[Comparison] = (
+                comparisons_to_deactivate
+            )
 
             filtered_ccs = [
                 cc
@@ -339,9 +339,9 @@ def _iteration_history_records(self):
         for r in records:
             r["iteration"] = iteration
             # TODO: why lambda from current settings, not history?
-            r[
-                "probability_two_random_records_match"
-            ] = self.core_model_settings.probability_two_random_records_match
+            r["probability_two_random_records_match"] = (
+                self.core_model_settings.probability_two_random_records_match
+            )
 
         output_records.extend(records)
         return output_records
6 changes: 3 additions & 3 deletions splink/expectation_maximisation.py
@@ -408,9 +408,9 @@ def _max_change_in_parameters_comparison_levels(
     max_change_levels["prev_comparison_level"] = None
     max_change_levels["current_comparison_level"] = None
     max_change_levels["max_change_type"] = "probability_two_random_records_match"
-    max_change_levels[
-        "max_change_value"
-    ] = change_probability_two_random_records_match
+    max_change_levels["max_change_value"] = (
+        change_probability_two_random_records_match
+    )
     max_change_levels["max_abs_change_value"] = abs(
         change_probability_two_random_records_match
     )
1 change: 0 additions & 1 deletion splink/find_brs_with_comparison_counts_below_threshold.py
@@ -74,7 +74,6 @@ def _generate_blocking_rule(
     if len(cols_as_string) == 0:
         br: BlockingRuleCreator = CustomRule("1=1", linker._sql_dialect)
     else:
-
         br = block_on(*cols_as_string)
 
     return br.get_blocking_rule(linker._sql_dialect)
1 change: 0 additions & 1 deletion splink/find_matches_to_new_records.py
@@ -19,7 +19,6 @@ def add_unique_id_and_source_dataset_cols_if_needed(
     # Add source dataset column to new records if required and not exists
     sds_sel_sql = ""
     if sds_col := linker._settings_obj.column_info_settings.source_dataset_column_name:
-
         if sds_col not in cols:
             sds_sel_sql = f", 'new_record' as {sds_col}"
 