Type hinting improvements - mypy conformance #2051

Merged: 49 commits, Mar 14, 2024

Commits
01d9876
fix type-checking imports
ADBond Mar 12, 2024
281fb13
bump mypy version
ADBond Mar 12, 2024
678cd19
fix SettingsCreator path logic
ADBond Mar 12, 2024
30d7a96
add tf_adjustment_chart test
ADBond Mar 12, 2024
ab6f1e0
wrap possible issue in `read_resource`
ADBond Mar 12, 2024
37e278f
typing dialect base class
ADBond Mar 12, 2024
4a2856e
more specific type-hinting
ADBond Mar 12, 2024
f2ec72c
expand on variables and types
ADBond Mar 12, 2024
cfa75c2
fix types + var names
ADBond Mar 12, 2024
5a7e85d
guard for missing `_dialect_name_for_factory` in lookup
ADBond Mar 12, 2024
a72a0af
SplinkDataFrame -> ABC
ADBond Mar 12, 2024
e4d9930
clarify vars
ADBond Mar 12, 2024
020d428
lint with black
ADBond Mar 12, 2024
689bf1f
allow ColumnExpression args
ADBond Mar 12, 2024
ad7f311
+ base var
ADBond Mar 12, 2024
8f0cdc5
improve type-hinting settings
ADBond Mar 12, 2024
ca0e587
permissively typed dict, rather than inferred Noneish
ADBond Mar 12, 2024
abc5fa1
colexp type hint
ADBond Mar 12, 2024
e684d39
update temp-skipped typing modules
ADBond Mar 12, 2024
d83e950
clarify var names
ADBond Mar 12, 2024
c15add2
fix return type
ADBond Mar 12, 2024
20fcfa4
permissive dict
ADBond Mar 12, 2024
cce16e4
type hinting and variable clarifications
ADBond Mar 12, 2024
66c5195
update lockfile
ADBond Mar 12, 2024
de6c36c
mypy github action
ADBond Mar 12, 2024
f1de170
mypy action - install correct group for library
ADBond Mar 12, 2024
86a2746
specify cache to this workflow
ADBond Mar 12, 2024
deb8617
optional atts and ignoring downstream/compat issues
ADBond Mar 12, 2024
dccf266
type hints and variable renamings
ADBond Mar 12, 2024
4111e30
lint with black
ADBond Mar 12, 2024
7f46419
br typing
ADBond Mar 13, 2024
b05c34d
optional attrs marked as such
ADBond Mar 13, 2024
366ad51
only final on setters
ADBond Mar 13, 2024
6bbcb67
more temporary mypy exclusions
ADBond Mar 13, 2024
2f0ee81
generalise type
ADBond Mar 13, 2024
4adce08
use alias to remove cyclicity
ADBond Mar 13, 2024
7f34cc4
type hint
ADBond Mar 13, 2024
bc6155b
guard against sqlglot optionals
ADBond Mar 13, 2024
57b376a
handle some blocking rule edge cases
ADBond Mar 13, 2024
c358ead
correct arg type
ADBond Mar 13, 2024
1ef4e09
default tf off - clarify
ADBond Mar 13, 2024
72f7f7e
concrete type for merging dicts
ADBond Mar 13, 2024
7ec9765
lint with black
ADBond Mar 13, 2024
95f0ebe
annotate init to check body
ADBond Mar 13, 2024
ccc358a
Settings always dialected
ADBond Mar 13, 2024
399dc0c
annotate default path, remove needless code
ADBond Mar 13, 2024
96bd6cd
lint with black
ADBond Mar 13, 2024
bccf9f3
clearer type - Sequence for covariance
ADBond Mar 14, 2024
e146b48
split out logic for getting cluster ids when not supplied
ADBond Mar 14, 2024
60 changes: 60 additions & 0 deletions .github/workflows/mypy.yml
@@ -0,0 +1,60 @@
name: Type hinting with mypy
on:
  pull_request:
    branches:
      - master
      - '**dev'
    paths:
      - splink/**
      - tests/**
      - pyproject.toml

jobs:
  mypy:
    runs-on: ubuntu-20.04
    name: Check type hinting with mypy
    steps:
      #----------------------------------------------
      # check-out repo and set-up python
      #----------------------------------------------
      - name: Check out repository
        uses: actions/checkout@v3
      - name: Set up python
        id: setup-python
        uses: actions/setup-python@v4
        with:
          python-version: 3.9.10
      #----------------------------------------------
      # set up environment
      #----------------------------------------------
      - name: Load cached Poetry installation
        uses: actions/cache@v2
        with:
          path: ~/.local
          key: poetry-0
      - name: Install Poetry
        uses: snok/install-poetry@v1
        with:
          version: 1.8.2
          virtualenvs-create: true
          virtualenvs-in-project: true
          installer-parallel: true
      - name: Load cached venv
        id: cached-poetry-dependencies
        uses: actions/cache@v2
        with:
          path: .venv
          key: venv-typehint-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-00
      - name: Install dependencies
        if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
        run: poetry install --no-interaction --no-root --with=typechecking
      - name: Install library
        run: poetry install --no-interaction
      #----------------------------------------------
      # run mypy
      #----------------------------------------------
      - name: Run mypy
        run: |
          source .venv/bin/activate
          mypy splink

600 changes: 243 additions & 357 deletions poetry.lock

Large diffs are not rendered by default.

13 changes: 7 additions & 6 deletions pyproject.toml
@@ -56,7 +56,7 @@ rapidfuzz = ">=2.0.3"
[tool.poetry.group.typechecking]
optional = true
[tool.poetry.group.typechecking.dependencies]
mypy = "1.7.0"
mypy = "1.9.0"

[tool.poetry.extras]
pyspark = ["pyspark"]
@@ -112,16 +112,17 @@ markers = [
packages = "splink"
# temporary exclusions
exclude = [
    # modules getting substantial rewrites:
    '.*comparison_imports\.py$',
    '.*comparison.*library\.py',
    'comparison_level_composition',
    # modules with large number of errors
    '.*comparison.*library\.py',
    '.*linker\.py',
    '/settings_validation/'
]
# for now at least allow implicit optionals
# to cut down on noise. Easy to fix.
implicit_optional = true
# for now, ignore missing imports
# can remove later and install stubs, where existent
ignore_missing_imports = true
ignore_missing_imports = true
# don't follow imports to modules we don't want to typecheck yet
# eventually restore this back to the default "normal"
follow_imports = "silent"
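
Note on `implicit_optional = true`: it stops mypy flagging parameters annotated with a plain type but defaulted to `None`. A minimal, hypothetical snippet (not from the Splink codebase) showing the pattern this setting tolerates:

def describe(label: str = None) -> str:
    # with implicit_optional = true, mypy treats `label` as Optional[str];
    # under mypy's default setting this default would be reported as an error
    if label is None:
        return "no label"
    return f"label: {label}"
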
10 changes: 6 additions & 4 deletions splink/accuracy.py
@@ -151,10 +151,12 @@ def _select_found_by_blocking_rules(linker: "Linker"):
    brs = linker._settings_obj._blocking_rules_to_generate_predictions

    if brs:
        brs = [move_l_r_table_prefix_to_column_suffix(b.blocking_rule_sql) for b in brs]
        brs = [f"(coalesce({b}, false))" for b in brs]
        brs = " OR ".join(brs)
        br_col = f" ({brs}) "
        br_strings = [
            move_l_r_table_prefix_to_column_suffix(b.blocking_rule_sql) for b in brs
        ]
        wrapped_br_strings = [f"(coalesce({b}, false))" for b in br_strings]
        full_br_string = " OR ".join(wrapped_br_strings)
        br_col = f" ({full_br_string}) "
    else:
        br_col = " 1=1 "

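
The renaming above addresses a pattern mypy rejects: re-using one variable name for values of different types. A hypothetical, simplified illustration (not Splink code):

def join_rules(rules: list[str]) -> str:
    # mypy error: Incompatible types in assignment
    # (expression has type "str", variable has type "list[str]")
    rules = " OR ".join(rules)
    return rules

def join_rules_fixed(rules: list[str]) -> str:
    joined = " OR ".join(rules)  # a new name keeps each variable's type stable
    return joined
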
22 changes: 17 additions & 5 deletions splink/blocking.py
@@ -1,12 +1,13 @@
from __future__ import annotations

import logging
from typing import TYPE_CHECKING, List
from typing import TYPE_CHECKING, List, Optional

from sqlglot import parse_one
from sqlglot.expressions import Column, Join
from sqlglot.optimizer.eliminate_joins import join_condition

from .exceptions import SplinkException
from .input_column import InputColumn
from .misc import ensure_is_list
from .splink_dataframe import SplinkDataFrame
@@ -19,7 +20,7 @@
    from .linker import Linker


def blocking_rule_to_obj(br) -> BlockingRule:
def blocking_rule_to_obj(br: BlockingRule | dict | str) -> BlockingRule:
    if isinstance(br, BlockingRule):
        return br
    elif isinstance(br, dict):
@@ -262,9 +263,15 @@ def __init__(
        sqlglot_dialect: str = None,
        array_columns_to_explode: list = [],
    ):
        super().__init__(blocking_rule, sqlglot_dialect)
        if isinstance(blocking_rule, BlockingRule):
            blocking_rule_sql = blocking_rule.blocking_rule_sql
        elif isinstance(blocking_rule, dict):
            blocking_rule_sql = blocking_rule["blocking_rule_sql"]
        else:
            blocking_rule_sql = blocking_rule
        super().__init__(blocking_rule_sql, sqlglot_dialect)
        self.array_columns_to_explode: List[str] = array_columns_to_explode
        self.exploded_id_pair_table: SplinkDataFrame = None
        self.exploded_id_pair_table: Optional[SplinkDataFrame] = None

    def marginal_exploded_id_pairs_table_sql(self, linker: Linker, br: BlockingRule):
        """generates a table of the marginal id pairs from the exploded blocking rule
@@ -325,7 +332,12 @@ def exclude_pairs_generated_by_this_rule_sql(self, linker: Linker):
        unique_id_input_columns = (
            linker._settings_obj.column_info_settings.unique_id_input_columns
        )
        splink_df = self.exploded_id_pair_table
        if (splink_df := self.exploded_id_pair_table) is None:
            raise SplinkException(
                "Must use `materialise_exploded_id_table(linker)` "
                "to set `exploded_id_pair_table` before calling "
                "exclude_pairs_generated_by_this_rule_sql()."
            )
        ids_to_compare_sql = f"select * from {splink_df.physical_name}"

        id_expr_l = _composite_unique_id_from_nodes_sql(unique_id_input_columns, "l")
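
The guard added above is a common way to satisfy mypy once an attribute is declared Optional: narrow it to a concrete value (and fail loudly) before using it. A minimal, hypothetical sketch of the pattern, not Splink's actual classes:

from typing import Optional

class Job:
    def __init__(self) -> None:
        self.result_table: Optional[str] = None  # populated later by a materialise step

    def result_sql(self) -> str:
        if (table := self.result_table) is None:
            raise RuntimeError("materialise the result table before requesting its SQL")
        return f"select * from {table}"  # mypy now knows `table` is a str
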
9 changes: 7 additions & 2 deletions splink/blocking_rule_library.py
@@ -75,6 +75,8 @@ def create_sql(self, sql_dialect: SplinkDialect) -> str:


class _Merge(BlockingRuleCreator):
    _clause = ""

    @final
    def __init__(
        self,
@@ -90,7 +92,10 @@ def __init__(
            raise ValueError(
                f"Must provide at least one blocking rule to {type(self)}()"
            )
        self.blocking_rules = blocking_rules
        blocking_rule_creators = [
            CustomRule(**br) if isinstance(br, dict) else br for br in blocking_rules
        ]
        self.blocking_rules = blocking_rule_creators

    @property
    def salting_partitions(self):
@@ -165,7 +170,7 @@ def block_on(
    )

    if len(col_names_or_exprs) == 1:
        br = ExactMatchRule(col_names_or_exprs[0])
        br: BlockingRuleCreator = ExactMatchRule(col_names_or_exprs[0])
    else:
        br = And(*[ExactMatchRule(c) for c in col_names_or_exprs])

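
Annotating `br: BlockingRuleCreator` on the first branch is the usual fix for mypy inferring a variable's type from its first assignment and then rejecting a sibling branch. A hypothetical, simplified sketch of that behaviour (not Splink's class hierarchy):

class Rule: ...
class ExactRule(Rule): ...
class CombinedRule(Rule): ...

def choose(single: bool) -> Rule:
    rule: Rule = ExactRule()  # without the annotation, mypy pins `rule` to ExactRule
    if not single:
        rule = CombinedRule()  # ...and would then reject this assignment
    return rule
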
108 changes: 64 additions & 44 deletions splink/cluster_studio.py
@@ -3,7 +3,7 @@
import json
import os
import random
from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Any, Optional

from jinja2 import Template

@@ -42,7 +42,7 @@ def _clusters_sql(df_clustered_nodes, cluster_ids: list) -> str:

def df_clusters_as_records(
    linker: "Linker", df_clustered_nodes: SplinkDataFrame, cluster_ids: list
):
) -> list[dict]:
    """Retrieves distinct clusters which exist in df_clustered_nodes based on
    list of cluster IDs provided and converts them to a record dictionary.

@@ -86,7 +86,7 @@ def _nodes_sql(df_clustered_nodes, cluster_ids) -> str:

def create_df_nodes(
    linker: "Linker", df_clustered_nodes: SplinkDataFrame, cluster_ids: list
):
) -> SplinkDataFrame:
    """Retrieves nodes from df_clustered_nodes for list of cluster IDs provided.

    Args:
@@ -150,7 +150,7 @@ def df_edges_as_records(

def _get_random_cluster_ids(
    linker: "Linker", connected_components: SplinkDataFrame, sample_size: int, seed=None
):
) -> list[str]:
    sql = f"""
    select count(distinct cluster_id) as count
    from {connected_components.physical_name}
@@ -189,7 +189,7 @@ def _get_random_cluster_ids(

def _get_cluster_id_of_each_size(
    linker: "Linker", connected_components: SplinkDataFrame, rows_per_partition: int
):
) -> list[dict]:
    unique_id_col_name = linker._settings_obj.column_info_settings.unique_id_column_name
    sql = f"""
    select
@@ -233,7 +233,7 @@ def _get_lowest_density_clusters(
    df_cluster_metrics: SplinkDataFrame,
    rows_per_partition: int,
    min_nodes: int,
):
) -> list[dict]:
    """Returns lowest density clusters of different sizes by
    performing stratified sampling.

@@ -277,6 +277,55 @@ def _get_lowest_density_clusters(
    return df_lowest_density_clusters.as_record_dict()


def _get_cluster_ids(
    linker: "Linker",
    df_clustered_nodes: SplinkDataFrame,
    sampling_method,
    sample_size,
    sample_seed,
    _df_cluster_metrics: Optional[SplinkDataFrame] = None,
) -> tuple[list, list]:
    if sampling_method == "random":
        cluster_ids = _get_random_cluster_ids(
            linker, df_clustered_nodes, sample_size, sample_seed
        )
        cluster_names = []
    elif sampling_method == "by_cluster_size":
        cluster_id_infos = _get_cluster_id_of_each_size(
            linker, df_clustered_nodes, rows_per_partition=1
        )
        if len(cluster_id_infos) > sample_size:
            cluster_id_infos = random.sample(cluster_id_infos, k=sample_size)
        cluster_names = [
            f"Cluster ID: {c['cluster_id']}, size: {c['cluster_size']}"
            for c in cluster_id_infos
        ]
        cluster_ids = [c["cluster_id"] for c in cluster_id_infos]
    elif sampling_method == "lowest_density_clusters_by_size":
        if _df_cluster_metrics is None:
            raise SplinkException(
                """To sample by density, you must provide a cluster metrics table
                containing density. This can be generated by calling the
                _compute_graph_metrics method on the linker."""
            )
        # Using sensible default for min_nodes. Might become option
        # for users in future
        cluster_id_infos = _get_lowest_density_clusters(
            linker, _df_cluster_metrics, rows_per_partition=1, min_nodes=3
        )
        if len(cluster_id_infos) > sample_size:
            cluster_id_infos = random.sample(cluster_id_infos, k=sample_size)
        cluster_names = [
            f"""Cluster ID: {c['cluster_id']}, density (4dp): {c['density_4dp']},
            size: {c['cluster_size']}"""
            for c in cluster_id_infos
        ]
        cluster_ids = [c["cluster_id"] for c in cluster_id_infos]
    else:
        raise ValueError(f"Unknown sampling method {sampling_method}")
    return cluster_ids, cluster_names


def render_splink_cluster_studio_html(
    linker: "Linker",
    df_predicted_edges: SplinkDataFrame,
@@ -285,7 +334,7 @@
    sampling_method="random",
    sample_size=10,
    sample_seed=None,
    cluster_ids: list = None,
    cluster_ids: list[str] = None,
    cluster_names: list = None,
    overwrite: bool = False,
    _df_cluster_metrics: SplinkDataFrame = None,
@@ -296,43 +345,15 @@
"cluster_colname": "cluster_id",
"prob_colname": "match_probability",
}
named_clusters_dict = None
if cluster_ids is None:
if sampling_method == "random":
cluster_ids = _get_random_cluster_ids(
linker, df_clustered_nodes, sample_size, sample_seed
)
if sampling_method == "by_cluster_size":
cluster_ids = _get_cluster_id_of_each_size(linker, df_clustered_nodes, 1)
if len(cluster_ids) > sample_size:
cluster_ids = random.sample(cluster_ids, k=sample_size)
cluster_names = [
f"Cluster ID: {c['cluster_id']}, size: {c['cluster_size']}"
for c in cluster_ids
]
cluster_ids = [c["cluster_id"] for c in cluster_ids]
named_clusters_dict = dict(zip(cluster_ids, cluster_names))
if sampling_method == "lowest_density_clusters_by_size":
if _df_cluster_metrics is None:
raise SplinkException(
"""To sample by density, you must provide a cluster metrics table
containing density. This can be generated by calling the
_compute_graph_metrics method on the linker."""
)
# Using sensible default for min_nodes. Might become option
# for users in future
cluster_ids = _get_lowest_density_clusters(
linker, _df_cluster_metrics, rows_per_partition=1, min_nodes=3
)
if len(cluster_ids) > sample_size:
cluster_ids = random.sample(cluster_ids, k=sample_size)
cluster_names = [
f"""Cluster ID: {c['cluster_id']}, density (4dp): {c['density_4dp']},
size: {c['cluster_size']}"""
for c in cluster_ids
]
cluster_ids = [c["cluster_id"] for c in cluster_ids]
named_clusters_dict = dict(zip(cluster_ids, cluster_names))
cluster_ids, cluster_names = _get_cluster_ids(
linker,
df_clustered_nodes,
sampling_method,
sample_size,
sample_seed,
_df_cluster_metrics,
)

    cluster_recs = df_clusters_as_records(linker, df_clustered_nodes, cluster_ids)
    df_nodes = create_df_nodes(linker, df_clustered_nodes, cluster_ids)
@@ -356,7 +377,6 @@ def render_splink_cluster_studio_html(
    if cluster_names:
        named_clusters_dict = dict(zip(cluster_ids, cluster_names))

    if named_clusters_dict:
        template_data["named_clusters"] = json.dumps(
            named_clusters_dict, cls=EverythingEncoder
        )
6 changes: 4 additions & 2 deletions splink/column_expression.py
@@ -1,8 +1,10 @@
from __future__ import annotations

import re
import string
from copy import copy
from functools import partial
from typing import Union
from typing import Callable, Union

import sqlglot

@@ -38,7 +40,7 @@ class ColumnExpression:

    def __init__(self, sql_expression: str, sql_dialect: SplinkDialect = None):
        self.raw_sql_expression = sql_expression
        self.operations = []
        self.operations: list[Callable] = []
        if sql_dialect is not None:
            self.sql_dialect: SplinkDialect = sql_dialect

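Annotating `self.operations: list[Callable]` addresses the "need type annotation" complaint mypy raises for attributes initialised to an empty container. A small, hypothetical sketch of the same pattern (not the ColumnExpression class itself):

from typing import Callable

class TextPipeline:
    def __init__(self) -> None:
        # a bare `[]` gives mypy no element type, so spell it out here
        self.steps: list[Callable[[str], str]] = []

    def add_step(self, step: Callable[[str], str]) -> None:
        self.steps.append(step)

    def run(self, value: str) -> str:
        for step in self.steps:
            value = step(value)
        return value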