From 6c0c1b72dcd65cdfc51ef48e0f85d1cbdb9c6b88 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Tue, 14 May 2024 14:24:17 +0100 Subject: [PATCH 01/59] refactor blocking to not need linker --- splink/analyse_blocking.py | 623 ++++++++++++------ splink/blocking.py | 223 ++++--- .../blocking_rule_generated_comparisons.json | 19 +- ..._with_comparison_counts_below_threshold.py | 6 +- splink/linker.py | 334 +++------- splink/misc.py | 16 - splink/settings.py | 19 + 7 files changed, 681 insertions(+), 559 deletions(-) diff --git a/splink/analyse_blocking.py b/splink/analyse_blocking.py index 12b91a57e2..00629ec35d 100644 --- a/splink/analyse_blocking.py +++ b/splink/analyse_blocking.py @@ -1,190 +1,112 @@ from __future__ import annotations -from copy import deepcopy -from typing import TYPE_CHECKING, Sequence, TypedDict, Union +import logging +from typing import Iterable, List, Literal, Union import pandas as pd - -from .blocking import BlockingRule, _sql_gen_where_condition, block_using_rules_sqls -from .misc import calculate_cartesian, calculate_reduction_ratio +import sqlglot + +from .blocking import ( + BlockingRule, + _sql_gen_where_condition, + block_using_rules_sqls, + materialise_exploded_id_tables, +) +from .blocking_rule_creator import BlockingRuleCreator +from .blocking_rule_creator_utils import to_blocking_rule_creator +from .charts import cumulative_blocking_rule_comparisons_generated +from .database_api import DatabaseAPI, DatabaseAPISubClass +from .input_column import InputColumn +from .misc import calculate_cartesian from .pipeline import CTEPipeline -from .vertically_concatenate import compute_df_concat, enqueue_df_concat - -# https://stackoverflow.com/questions/39740632/python-type-hinting-without-cyclic-imports -if TYPE_CHECKING: - from .linker import Linker - - -def number_of_comparisons_generated_by_blocking_rule_post_filters_sql( - linker: Linker, - blocking_rule: str, -) -> str: - settings_obj = linker._settings_obj - - where_condition = _sql_gen_where_condition( - settings_obj._link_type, - settings_obj.column_info_settings.unique_id_input_columns, - ) - - sql = f""" - select count(*) as count_of_pairwise_comparisons_generated - - from __splink__df_concat as l - inner join __splink__df_concat as r - on - {blocking_rule} - {where_condition} - """ +from .splink_dataframe import SplinkDataFrame +from .vertically_concatenate import ( + split_df_concat_with_tf_into_two_tables_sqls, + vertically_concatenate_sql, +) - return sql +logger = logging.getLogger(__name__) -class CumulativeComparisonsDict(TypedDict): - row_count: int - rule: str - cumulative_rows: int - cartesian: int - reduction_ratio: str - start: int +link_type_type = Literal["link_only", "link_and_dedupe", "dedupe_only"] -def cumulative_comparisons_generated_by_blocking_rules( - linker: Linker, - blocking_rules: Sequence[str | BlockingRule], - output_chart: bool = True, - return_dataframe: bool = False, -) -> pd.DataFrame | list[CumulativeComparisonsDict]: - # Deepcopy our original linker so we can safely adjust our settings. - # This is particularly important to ensure we don't overwrite our - # original blocking rules. 
- linker = deepcopy(linker) - - settings_obj = linker._settings_obj - linker._settings_obj = settings_obj +def _number_of_comparisons_generated_by_blocking_rule_post_filters_sqls( + input_data_dict: dict[str, "SplinkDataFrame"], + blocking_rule: Union[str, "BlockingRule"], + link_type: str, + db_api: DatabaseAPISubClass, + unique_id_column_name: str, +) -> str: + input_dataframes = list(input_data_dict.values()) - if blocking_rules: - brs_as_objs = settings_obj._brs_as_objs(blocking_rules) + if len(input_dataframes) > 1: + unique_id_cols = [ + InputColumn(unique_id_column_name, sql_dialect=db_api.sql_dialect.name), + InputColumn("source_dataset", sql_dialect=db_api.sql_dialect.name), + ] else: - brs_as_objs = linker._settings_obj._blocking_rules_to_generate_predictions + unique_id_cols = [ + InputColumn(unique_id_column_name, sql_dialect=db_api.sql_dialect.name), + ] + where_condition = _sql_gen_where_condition(link_type, unique_id_cols) - # Turn tf off. No need to apply term frequencies to perform these calcs - settings_obj._retain_matching_columns = False - settings_obj._retain_intermediate_calculation_columns = False - for cc in settings_obj.comparisons: - for cl in cc.comparison_levels: - # TODO: ComparisonLevel: manage access - cl._tf_adjustment_column = None - - pipeline = CTEPipeline() - concat = compute_df_concat(linker, pipeline) - - # Calculate the Cartesian Product - if output_chart: - # We only need the cartesian product if we want to output the chart view - - if settings_obj._link_type == "dedupe_only": - group_by_statement = "" - else: - group_by_statement = "group by source_dataset" - - pipeline = CTEPipeline([concat]) - - sql = f""" - select count(*) as count - from {concat.physical_name} - {group_by_statement} - """ - - pipeline.enqueue_sql(sql, "__splink__cartesian_product") - cartesian_count = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline) - row_count_df = cartesian_count.as_record_dict() - cartesian_count.drop_table_from_database_and_remove_from_cache() + sqls = [] - cartesian = calculate_cartesian(row_count_df, settings_obj._link_type) + two_dataset_link_only = link_type == "link_only" and len(input_dataframes) == 2 - # Calculate the total number of rows generated by each blocking rule + if two_dataset_link_only: + input_tablename_l = input_dataframes[0].physical_name + input_tablename_r = input_dataframes[1].physical_name + else: + sql = vertically_concatenate_sql( + input_data_dict, salting_required=False, source_dataset_column_name=None + ) + sqls.append({"sql": sql, "output_table_name": "__splink__df_concat"}) - # Note two dataset link only is not currently supported - link_type = settings_obj._link_type + input_tablename_l = "__splink__df_concat" + input_tablename_r = "__splink__df_concat" - pipeline = CTEPipeline([concat]) - sql_infos = block_using_rules_sqls( - linker, - input_tablename_l="__splink__df_concat", - input_tablename_r="__splink__df_concat", - blocking_rules=brs_as_objs, - link_type=link_type, - ) - pipeline.enqueue_list_of_sqls(sql_infos) + sql = f""" + select count(*) as count_of_pairwise_comparisons_generated - sql = """ - select - count(*) as row_count, - match_key - from __splink__df_blocked - group by match_key - order by cast(match_key as int) asc + from {input_tablename_l} as l + inner join {input_tablename_r} as r + on + {blocking_rule.blocking_rule_sql} + {where_condition} """ - pipeline.enqueue_sql(sql, "__splink__df_count_cumulative_blocks") + sqls.append({"sql": sql, "output_table_name": "__splink__comparions_post_filter"}) + 
return sqls - cumulative_blocking_rule_count = linker.db_api.sql_pipeline_to_splink_dataframe( - pipeline - ) - br_n = cumulative_blocking_rule_count.as_pandas_dataframe() - # not all dialects return column names when frame is empty (e.g. sqlite, postgres) - if br_n.empty: - br_n["row_count"] = [] - br_n["match_key"] = [] - cumulative_blocking_rule_count.drop_table_from_database_and_remove_from_cache() - br_count, br_keys = list(br_n["row_count"]), list(br_n["match_key"].astype("int")) - - if len(br_count) != len(brs_as_objs): - missing_br = [x for x in range(len(brs_as_objs)) if x not in br_keys] - for n in missing_br: - br_count.insert(n, 0) - - br_comparisons = [] - cumulative_sum = 0 - # Wrap everything into an output dictionary - for row, br in zip(br_count, brs_as_objs): - out_dict = { - "row_count": row, - "rule": br.blocking_rule_sql, - } - if output_chart: - cumulative_sum += row - # Increase round threshold to capture more info on larger datasets - rr = round(calculate_reduction_ratio(cumulative_sum, cartesian), 6) - - rr_text = ( - "The rolling reduction ratio with your given blocking rule(s) " - f"is {rr}. This represents the reduction in the total number " - "of comparisons due to your rule(s)." - ) - additional_vals = { - "cumulative_rows": cumulative_sum, - "cartesian": int(cartesian), - "reduction_ratio": rr_text, - "start": cumulative_sum - row, - } - out_dict = {**out_dict, **additional_vals} +def _count_comparisons_from_blocking_rule_pre_filter_conditions_sqls( + input_data_dict: dict[str, "SplinkDataFrame"], + blocking_rule: Union[str, "BlockingRule"], + link_type: str, + db_api: DatabaseAPISubClass, +): + input_dataframes = list(input_data_dict.values()) - br_comparisons.append(out_dict.copy()) + if isinstance(blocking_rule, str): + blocking_rule = BlockingRule(blocking_rule, sqlglot_dialect=db_api.sql_dialect) - if return_dataframe: - return pd.DataFrame(br_comparisons) - else: - return br_comparisons + join_conditions = blocking_rule._equi_join_conditions + two_dataset_link_only = link_type == "link_only" and len(input_dataframes) == 2 + sqls = [] -def count_comparisons_from_blocking_rule_pre_filter_conditions_sqls( - linker: "Linker", blocking_rule: Union[str, "BlockingRule"] -) -> list[dict[str, str]]: - if isinstance(blocking_rule, str): - blocking_rule = BlockingRule(blocking_rule, sqlglot_dialect=linker._sql_dialect) + if two_dataset_link_only: + input_tablename_l = input_dataframes[0].physical_name + input_tablename_r = input_dataframes[1].physical_name + else: + sql = vertically_concatenate_sql( + input_data_dict, salting_required=False, source_dataset_column_name=None + ) + sqls.append({"sql": sql, "output_table_name": "__splink__df_concat"}) - join_conditions = blocking_rule._equi_join_conditions + input_tablename_l = "__splink__df_concat" + input_tablename_r = "__splink__df_concat" l_cols_sel = [] r_cols_sel = [] @@ -207,20 +129,8 @@ def count_comparisons_from_blocking_rule_pre_filter_conditions_sqls( r_cols_gb_str = ", ".join(r_cols_gb) using_str = ", ".join(using) - sqls = [] - - if linker._two_dataset_link_only: - # Can just use the raw input datasets - keys = list(linker._input_tables_dict.keys()) - input_tablename_l = linker._input_tables_dict[keys[0]].physical_name - input_tablename_r = linker._input_tables_dict[keys[1]].physical_name - - else: - input_tablename_l = "__splink__df_concat" - input_tablename_r = "__splink__df_concat" - if not join_conditions: - if linker._two_dataset_link_only: + if two_dataset_link_only: sql = f""" SELECT (SELECT 
COUNT(*) FROM {input_tablename_l})
             *
             (SELECT COUNT(*) FROM {input_tablename_r})
             as count_of_pairwise_comparisons_generated
             """
         else:
             sql = f"""
             select count(*) * count(*) as count_of_pairwise_comparisons_generated
             from {input_tablename_l}
             """
     else:
         sql = f"""
         select count(*) as count_of_pairwise_comparisons_generated
         from
         (select {l_cols_sel_str}
         from {input_tablename_l})
         as l
         inner join
         (select {r_cols_sel_str}
         from {input_tablename_r})
         as r
         using ({using_str})
         """
 
@@ -269,7 +179,7 @@ def count_comparisons_from_blocking_rule_pre_filter_conditions_sqls(
     sqls.append({"sql": sql, "output_table_name": "__splink__block_counts"})
 
     sql = """
-    select sum(block_count) as count_of_pairwise_comparisons_generated
+    select cast(sum(block_count) as integer) as count_of_pairwise_comparisons_generated
     from __splink__block_counts
     """
 
@@ -278,17 +188,362 @@
     return sqls
 
 
-def count_comparisons_from_blocking_rule_pre_filter_conditions(
-    linker: "Linker", blocking_rule: Union[str, "BlockingRule"]
-) -> int:
+def _row_counts_per_input_table(
+    splink_df_dict: dict[str, "SplinkDataFrame"],
+    link_type: link_type_type,
+    source_dataset_column_name: str,
+    db_api: DatabaseAPI,
+):
     pipeline = CTEPipeline()
-    pipeline = enqueue_df_concat(linker, pipeline)
 
-    sqls = count_comparisons_from_blocking_rule_pre_filter_conditions_sqls(
-        linker, blocking_rule
+    sql = vertically_concatenate_sql(
+        splink_df_dict,
+        salting_required=False,
+        source_dataset_column_name=source_dataset_column_name,
     )
+    pipeline.enqueue_sql(sql, "__splink__df_concat")
+
+    if link_type == "dedupe_only":
+        sql = """
+        select count(*) as count
+        from __splink__df_concat
+        """
+    else:
+        sql = f"""
+        select count(*) as count
+        from __splink__df_concat
+        group by {source_dataset_column_name}
+        """
+    pipeline.enqueue_sql(sql, "__splink__df_count")
+    return db_api.sql_pipeline_to_splink_dataframe(pipeline)
+
+
+def _cumulative_comparisons_to_be_scored_from_blocking_rules(
+    *,
+    splink_df_dict: dict[str, "SplinkDataFrame"],
+    blocking_rules: Iterable[BlockingRule],
+    link_type: link_type_type,
+    db_api: DatabaseAPI,
+    max_rows_limit: int = 1e9,
+    unique_id_column_name: str,
+    source_dataset_column_name: str = None,
+) -> pd.DataFrame:
+    unique_id_input_column = InputColumn(
+        unique_id_column_name, sql_dialect=db_api.sql_dialect.name
+    )
+    if link_type == "dedupe_only":
+        source_dataset_input_column = None
+        input_columns = [unique_id_input_column]
+    else:
+        source_dataset_input_column = InputColumn(
+            source_dataset_column_name, sql_dialect=db_api.sql_dialect.name
+        )
+        input_columns = [unique_id_input_column, source_dataset_input_column]
+
+    # Check none of the blocking rules will create a vast/computationally
+    # intractable number of comparisons
+    for br in blocking_rules:
+        # TODO: Deal properly with exploding rules
+        count = _count_comparisons_generated_from_blocking_rule(
+            splink_df_dict=splink_df_dict,
+            blocking_rule=br,
+            link_type=link_type,
+            db_api=db_api,
+            max_rows_limit=max_rows_limit,
+            compute_post_filter_count=False,
+            unique_id_column_name=unique_id_column_name,
+        )
+        count_pre_filter = count[
+            "number_of_comparisons_generated_pre_filter_conditions"
+        ]
+
+        if count_pre_filter > max_rows_limit:
+            # TODO: Use a SplinkException? Want this to give a sensible message
+            # when coming from estimate_probability_two_random_records_match
+            raise ValueError(
+                f"Blocking rule {br.blocking_rule_sql} would create {count_pre_filter} "
+                "comparisons.\nThis exceeds the max_rows_limit of "
+                f"{max_rows_limit}.\nPlease tighten the "
+                "blocking rule or increase the max_rows_limit."
+ ) + + rc = _row_counts_per_input_table( + splink_df_dict, + link_type, + source_dataset_column_name, + db_api, + ).as_record_dict() + + cartesian_count = calculate_cartesian(rc, link_type) + + for n, br in enumerate(blocking_rules): + br.add_preceding_rules(blocking_rules[:n]) + + exploding_br_with_id_tables = materialise_exploded_id_tables( + link_type, + blocking_rules, + db_api, + splink_df_dict, + source_dataset_input_column=source_dataset_input_column, + unique_id_input_column=unique_id_input_column, + ) + + pipeline = CTEPipeline() + + sql = vertically_concatenate_sql( + splink_df_dict, + salting_required=False, + source_dataset_column_name=source_dataset_column_name, + ) + + pipeline.enqueue_sql(sql, "__splink__df_concat") + + sql_select_expr = ",".join( + [item for c in input_columns for item in c.l_r_names_as_l_r] + ) + + blocking_input_tablename_l = "__splink__df_concat" + blocking_input_tablename_r = "__splink__df_concat" + if len(splink_df_dict) == 2 and link_type == "link_only": + sqls = split_df_concat_with_tf_into_two_tables_sqls( + "__splink__df_concat", + source_dataset_column_name, + ) + pipeline.enqueue_list_of_sqls(sqls) + + blocking_input_tablename_l = "__splink__df_concat_left" + blocking_input_tablename_r = "__splink__df_concat_right" + link_type = "two_dataset_link_only" + + sqls = block_using_rules_sqls( + input_tablename_l=blocking_input_tablename_l, + input_tablename_r=blocking_input_tablename_r, + blocking_rules=blocking_rules, + link_type="dedupe_only", + set_match_probability_to_one=True, + unique_id_input_column=unique_id_input_column, + source_dataset_input_column=source_dataset_input_column, + columns_to_select_sql=sql_select_expr, + ) + pipeline.enqueue_list_of_sqls(sqls) - df_res = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline) - res = df_res.as_record_dict()[0] - return int(res["count_of_pairwise_comparisons_generated"]) + sql = """ + select + count(*) as row_count, + match_key + from __splink__df_blocked + group by match_key + order by cast(match_key as int) asc + """ + pipeline.enqueue_sql(sql, "__splink__df_count_cumulative_blocks") + + sql = f""" + SELECT + row_count, + match_key, + cast(SUM(row_count) OVER (ORDER BY match_key) as int) AS cumulative_rows, + cast(SUM(row_count) OVER (ORDER BY match_key) - row_count as int) AS start, + cast({cartesian_count} as int) as cartesian + + FROM + __splink__df_count_cumulative_blocks + """ + + pipeline.enqueue_sql(sql, "__splink__df_count_cumulative_blocks_2") + + records = db_api.sql_pipeline_to_splink_dataframe(pipeline).as_record_dict() + + # Lookup table match_key -> blocking_rule + rules = {i: r.blocking_rule_sql for i, r in enumerate(blocking_rules)} + + for r in records: + r["blocking_rule"] = rules[int(r["match_key"])] + + [b.drop_materialised_id_pairs_dataframe() for b in exploding_br_with_id_tables] + + col_order = [ + "blocking_rule", + "row_count", + "cumulative_rows", + "cartesian", + "match_key", + "start", + ] + return pd.DataFrame(records)[col_order] + + +def _count_comparisons_generated_from_blocking_rule( + *, + splink_df_dict: dict[str, "SplinkDataFrame"], + blocking_rule: BlockingRule, + link_type: link_type_type, + db_api: DatabaseAPI, + compute_post_filter_count: bool = False, + max_rows_limit: int = 1e9, + unique_id_column_name: str = "unique_id", +): + # TODO: if it's an exploding blocking rule, make sure we error out + pipeline = CTEPipeline() + sqls = _count_comparisons_from_blocking_rule_pre_filter_conditions_sqls( + splink_df_dict, blocking_rule, link_type, db_api + ) + 
pipeline.enqueue_list_of_sqls(sqls)
+    pre_filter_total_df = db_api.sql_pipeline_to_splink_dataframe(pipeline)
+
+    pre_filter_total = pre_filter_total_df.as_record_dict()[0][
+        "count_of_pairwise_comparisons_generated"
+    ]
+    pre_filter_total_df.drop_table_from_database_and_remove_from_cache()
+
+    def add_l_r(sql, table_name):
+        tree = sqlglot.parse_one(sql, dialect=db_api.sql_dialect.sqlglot_name)
+        for node in tree.find_all(sqlglot.expressions.Column):
+            node.set("table", table_name)
+        return tree.sql(dialect=db_api.sql_dialect.sqlglot_name)
+
+    equi_join_conditions = [
+        add_l_r(i, "l") + " = " + add_l_r(j, "r")
+        for i, j in blocking_rule._equi_join_conditions
+    ]
+
+    equi_join_conditions = " AND ".join(equi_join_conditions)
+
+    filter_conditions = blocking_rule._filter_conditions
+    if filter_conditions == "TRUE":
+        filter_conditions = ""
+
+    if not compute_post_filter_count:
+        return {
+            "number_of_comparisons_generated_pre_filter_conditions": pre_filter_total,
+            "number_of_comparisons_to_be_scored_post_filter_conditions": "not computed",
+            "filter_conditions_identified": filter_conditions,
+            "equi_join_conditions_identified": equi_join_conditions,
+        }
+
+    if pre_filter_total < max_rows_limit:
+        pipeline = CTEPipeline()
+        sqls = _number_of_comparisons_generated_by_blocking_rule_post_filters_sqls(
+            splink_df_dict, blocking_rule, link_type, db_api, unique_id_column_name
+        )
+        pipeline.enqueue_list_of_sqls(sqls)
+        post_filter_total_df = db_api.sql_pipeline_to_splink_dataframe(pipeline)
+        post_filter_total = post_filter_total_df.as_record_dict()[0][
+            "count_of_pairwise_comparisons_generated"
+        ]
+        post_filter_total_df.drop_table_from_database_and_remove_from_cache()
+    else:
+        post_filter_total = "exceeded max_rows_limit, see warning"
+
+        logger.warning(
+            "WARNING:\nComputation of number of comparisons post-filter conditions was "
+            f"skipped because the number of comparisons generated by your "
+            f"blocking rule exceeded max_rows_limit={max_rows_limit:.2e}."
+            "\nIt is likely to be slow to compute.\nIf you still want to go ahead,"
+            " increase the value of the max_rows_limit argument to above "
+            f"{pre_filter_total:.3e}.\nRead more about the definitions here:\n"
+            "https://moj-analytical-services.github.io/splink/topic_guides/blocking/performance.html?h=filter+cond#filter-conditions"
+        )
+
+    return {
+        "number_of_comparisons_generated_pre_filter_conditions": pre_filter_total,
+        "number_of_comparisons_to_be_scored_post_filter_conditions": post_filter_total,
+        "filter_conditions_identified": filter_conditions,
+        "equi_join_conditions_identified": equi_join_conditions,
+    }
+
+
+def count_comparisons_from_blocking_rule(
+    *,
+    table_or_tables,
+    blocking_rule: Union[BlockingRuleCreator, str, dict],
+    link_type: link_type_type,
+    db_api: DatabaseAPI,
+    unique_id_column_name: str,
+    compute_post_filter_count: bool = False,
+    max_rows_limit: int = 1e9,
+):
+    if not isinstance(blocking_rule, BlockingRule):
+        blocking_rule = to_blocking_rule_creator(blocking_rule).get_blocking_rule(
+            db_api.sql_dialect.name
+        )
+
+    splink_df_dict = db_api.register_multiple_tables(table_or_tables)
+
+    return _count_comparisons_generated_from_blocking_rule(
+        splink_df_dict=splink_df_dict,
+        blocking_rule=blocking_rule,
+        link_type=link_type,
+        db_api=db_api,
+        compute_post_filter_count=compute_post_filter_count,
+        max_rows_limit=max_rows_limit,
+        unique_id_column_name=unique_id_column_name,
+    )
+
+
+def cumulative_comparisons_to_be_scored_from_blocking_rules_data(
+    *,
+    table_or_tables,
+    blocking_rule_creators: Iterable[Union[BlockingRuleCreator, str, dict]],
+    link_type: link_type_type,
+    db_api: DatabaseAPI,
+    max_rows_limit: int = 1e9,
+    unique_id_column_name: str,
+    source_dataset_column_name: str = None,
+):
+    splink_df_dict = db_api.register_multiple_tables(table_or_tables)
+
+    blocking_rules: List[BlockingRule] = []
+    for br in blocking_rule_creators:
+        if isinstance(br, BlockingRule):
+            blocking_rules.append(br)
+        else:
+            blocking_rules.append(
+                to_blocking_rule_creator(br).get_blocking_rule(db_api.sql_dialect.name)
+            )
+
+    return _cumulative_comparisons_to_be_scored_from_blocking_rules(
+        splink_df_dict=splink_df_dict,
+        blocking_rules=blocking_rules,
+        link_type=link_type,
+        db_api=db_api,
+        max_rows_limit=max_rows_limit,
+        unique_id_column_name=unique_id_column_name,
+        source_dataset_column_name=source_dataset_column_name,
+    )
+
+
+def cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
+    *,
+    table_or_tables,
+    blocking_rule_creators: Iterable[Union[BlockingRuleCreator, str, dict]],
+    link_type: link_type_type,
+    db_api: DatabaseAPI,
+    max_rows_limit: int = 1e9,
+    unique_id_column_name: str,
+    source_dataset_column_name: str = None,
+):
+    splink_df_dict = db_api.register_multiple_tables(table_or_tables)
+
+    blocking_rules: List[BlockingRule] = []
+    for br in blocking_rule_creators:
+        if isinstance(br, BlockingRule):
+            blocking_rules.append(br)
+        else:
+            blocking_rules.append(
+                to_blocking_rule_creator(br).get_blocking_rule(db_api.sql_dialect.name)
+            )
+
+    pd_df = _cumulative_comparisons_to_be_scored_from_blocking_rules(
+        splink_df_dict=splink_df_dict,
+        blocking_rules=blocking_rules,
+        link_type=link_type,
+        db_api=db_api,
+        max_rows_limit=max_rows_limit,
+        unique_id_column_name=unique_id_column_name,
+        source_dataset_column_name=source_dataset_column_name,
+    )
+
+    return cumulative_blocking_rule_comparisons_generated(
+        pd_df.to_dict(orient="records")
+    )
diff --git a/splink/blocking.py b/splink/blocking.py
index 108662393a..406cedf1dd 
100644
--- a/splink/blocking.py
+++ b/splink/blocking.py
@@ -1,19 +1,21 @@
 from __future__ import annotations
 
 import logging
-from typing import TYPE_CHECKING, Any, List, Optional
+from typing import TYPE_CHECKING, Any, List, Optional, Tuple
 
 from sqlglot import parse_one
-from sqlglot.expressions import Column, Expression, Join
+from sqlglot.expressions import Column, Expression, Identifier, Join
 from sqlglot.optimizer.eliminate_joins import join_condition
+from sqlglot.optimizer.optimizer import optimize
 
+from .database_api import DatabaseAPI
 from .exceptions import SplinkException
 from .input_column import InputColumn
 from .misc import ensure_is_list
 from .pipeline import CTEPipeline
 from .splink_dataframe import SplinkDataFrame
 from .unique_id_concat import _composite_unique_id_from_nodes_sql
-from .vertically_concatenate import compute_df_concat_with_tf
+from .vertically_concatenate import vertically_concatenate_sql
 
 logger = logging.getLogger(__name__)
 
@@ -58,6 +60,16 @@ def blocking_rule_to_obj(br: BlockingRule | dict[str, Any] | str) -> BlockingRul
     return br
 
 
+def combine_unique_id_input_columns(
+    source_dataset_input_column: InputColumn, unique_id_input_column: InputColumn
+) -> Tuple[InputColumn, ...]:
+    unique_id_input_columns = ()
+    if source_dataset_input_column:
+        unique_id_input_columns += (source_dataset_input_column,)
+    unique_id_input_columns += (unique_id_input_column,)
+    return unique_id_input_columns
+
+
 class BlockingRule:
     def __init__(
         self,
@@ -88,7 +100,11 @@ def add_preceding_rules(self, rules):
         rules = ensure_is_list(rules)
         self.preceding_rules = rules
 
-    def exclude_pairs_generated_by_this_rule_sql(self, linker: Linker) -> str:
+    def exclude_pairs_generated_by_this_rule_sql(
+        self,
+        source_dataset_input_column: InputColumn,
+        unique_id_input_column: InputColumn,
+    ):
         """A SQL string specifying how to exclude the results of THIS blocking rule
         from subsequent blocking statements, so that subsequent
         statements do not produce duplicate pairs
@@ -99,14 +115,21 @@
         # meaning these comparisons get lost
         return f"coalesce(({self.blocking_rule_sql}),false)"
 
-    def exclude_pairs_generated_by_all_preceding_rules_sql(self, linker: Linker) -> str:
+    def exclude_pairs_generated_by_all_preceding_rules_sql(
+        self,
+        source_dataset_input_column: InputColumn,
+        unique_id_input_column: InputColumn,
+    ):
         """A SQL string that excludes the results of ALL previous blocking rules
         from the pairwise comparisons generated.
""" if not self.preceding_rules: return "" or_clauses = [ - br.exclude_pairs_generated_by_this_rule_sql(linker) + br.exclude_pairs_generated_by_this_rule_sql( + source_dataset_input_column, + unique_id_input_column, + ) for br in self.preceding_rules ] previous_rules = " OR ".join(or_clauses) @@ -114,16 +137,15 @@ def exclude_pairs_generated_by_all_preceding_rules_sql(self, linker: Linker) -> def create_blocked_pairs_sql( self, - linker: Linker, + source_dataset_input_column: InputColumn, + unique_id_input_column: InputColumn, *, - input_tablename_l: str, - input_tablename_r: str, - where_condition: str, - probability: str, - ) -> str: - columns_to_select = linker._settings_obj._columns_to_select_for_blocking - sql_select_expr = ", ".join(columns_to_select) - + input_tablename_l, + input_tablename_r, + where_condition, + probability, + sql_select_expr, + ): sql = f""" select {sql_select_expr} @@ -134,7 +156,7 @@ def create_blocked_pairs_sql( on ({self.blocking_rule_sql}) {where_condition} - {self.exclude_pairs_generated_by_all_preceding_rules_sql(linker)} + {self.exclude_pairs_generated_by_all_preceding_rules_sql(source_dataset_input_column, unique_id_input_column)} """ return sql @@ -190,6 +212,10 @@ def _filter_conditions(self): if not filter_condition: return "" else: + filter_condition = optimize(filter_condition) + for i in filter_condition.find_all(Identifier): + i.set("quoted", False) + return filter_condition.sql(self.sqlglot_dialect) def as_dict(self): @@ -247,17 +273,19 @@ def _salting_condition(self, salt): def create_blocked_pairs_sql( self, - linker: Linker, *, - input_tablename_l: str, - input_tablename_r: str, - where_condition: str, - probability: str, - ) -> str: - columns_to_select = linker._settings_obj._columns_to_select_for_blocking - sql_select_expr = ", ".join(columns_to_select) - + source_dataset_input_column: InputColumn, + unique_id_input_column: InputColumn, + input_tablename_l, + input_tablename_r, + where_condition, + probability, + sql_select_expr, + ): sqls = [] + exclude_sql = self.exclude_pairs_generated_by_all_preceding_rules_sql( + source_dataset_input_column, unique_id_input_column + ) for salt in range(self.salting_partitions): salt_condition = self._salting_condition(salt) sql = f""" @@ -270,13 +298,19 @@ def create_blocked_pairs_sql( on ({self.blocking_rule_sql} {salt_condition}) {where_condition} - {self.exclude_pairs_generated_by_all_preceding_rules_sql(linker)} + {exclude_sql} """ sqls.append(sql) return " UNION ALL ".join(sqls) +def _explode_arrays_sql(db_api, tbl_name, columns_to_explode, other_columns_to_retain): + return db_api.sql_dialect.explode_arrays_sql( + tbl_name, columns_to_explode, other_columns_to_retain + ) + + class ExplodingBlockingRule(BlockingRule): def __init__( self, @@ -295,17 +329,20 @@ def __init__( self.exploded_id_pair_table: Optional[SplinkDataFrame] = None def marginal_exploded_id_pairs_table_sql( - self, linker: Linker, br: BlockingRule, link_type: "LinkTypeLiteralType" - ) -> str: + self, + source_dataset_input_column: InputColumn, + unique_id_input_column: InputColumn, + br: BlockingRule, + link_type: "LinkTypeLiteralType", + ): """generates a table of the marginal id pairs from the exploded blocking rule i.e. 
pairs are only created that match this blocking rule and NOT
        any of the preceding blocking rules
        """
-        settings_obj = linker._settings_obj
-        unique_id_col = settings_obj.column_info_settings.unique_id_column_name
-        unique_id_input_columns = (
-            settings_obj.column_info_settings.unique_id_input_columns
+        unique_id_col = unique_id_input_column
+        unique_id_input_columns = combine_unique_id_input_columns(
+            source_dataset_input_column, unique_id_input_column
         )
 
         where_condition = _sql_gen_where_condition(link_type, unique_id_input_columns)
@@ -318,15 +355,18 @@
                 where_condition + " and l.source_dataset < r.source_dataset"
             )
 
+        exclude_sql = self.exclude_pairs_generated_by_all_preceding_rules_sql(
+            source_dataset_input_column, unique_id_input_column
+        )
         sql = f"""
         select distinct
-            {id_expr_l} as {unique_id_col}_l,
-            {id_expr_r} as {unique_id_col}_r
-        from __splink__df_concat_with_tf_unnested as l
-        inner join __splink__df_concat_with_tf_unnested as r
+            {id_expr_l} as {unique_id_col.name_l},
+            {id_expr_r} as {unique_id_col.name_r}
+        from __splink__df_concat_unnested as l
+        inner join __splink__df_concat_unnested as r
         on ({br.blocking_rule_sql})
         {where_condition}
-        {self.exclude_pairs_generated_by_all_preceding_rules_sql(linker)}
+        {exclude_sql}
         """
 
         return sql
@@ -336,18 +376,22 @@
     def drop_materialised_id_pairs_dataframe(self):
         self.exploded_id_pair_table.drop_table_from_database_and_remove_from_cache()
         self.exploded_id_pair_table = None
 
-    def exclude_pairs_generated_by_this_rule_sql(self, linker: Linker) -> str:
+    def exclude_pairs_generated_by_this_rule_sql(
+        self,
+        source_dataset_input_column: InputColumn,
+        unique_id_input_column: InputColumn,
+    ):
         """A SQL string specifying how to exclude the results of THIS blocking rule
         from subsequent blocking statements, so that subsequent
         statements do not produce duplicate pairs
         """
-        unique_id_column = (
-            linker._settings_obj.column_info_settings.unique_id_column_name
-        )
-        unique_id_input_columns = (
-            linker._settings_obj.column_info_settings.unique_id_input_columns
+        unique_id_column = unique_id_input_column
+
+        unique_id_input_columns = combine_unique_id_input_columns(
+            source_dataset_input_column, unique_id_input_column
         )
+
         if (splink_df := self.exploded_id_pair_table) is None:
             raise SplinkException(
                 "Must use `materialise_exploded_id_table(linker)` "
@@ -362,34 +406,34 @@
         return f"""EXISTS (
             select 1 from ({ids_to_compare_sql}) as ids_to_compare
             where (
-                {id_expr_l} = ids_to_compare.{unique_id_column}_l and
-                {id_expr_r} = ids_to_compare.{unique_id_column}_r
+                {id_expr_l} = ids_to_compare.{unique_id_column.name_l} and
+                {id_expr_r} = ids_to_compare.{unique_id_column.name_r}
             )
         )
         """
 
     def create_blocked_pairs_sql(
         self,
-        linker: Linker,
         *,
-        input_tablename_l: str,
-        input_tablename_r: str,
-        where_condition: str,
-        probability: str,
-    ) -> str:
-        columns_to_select = linker._settings_obj._columns_to_select_for_blocking
-        sql_select_expr = ", ".join(columns_to_select)
-
+        source_dataset_input_column: InputColumn,
+        unique_id_input_column: InputColumn,
+        input_tablename_l,
+        input_tablename_r,
+        where_condition,
+        probability,
+        sql_select_expr,
+    ):
         if self.exploded_id_pair_table is None:
             raise ValueError(
                 "Exploding blocking rules are not supported for the function you have"
                 " called."
) - settings_obj = linker._settings_obj - unique_id_col = settings_obj.column_info_settings.unique_id_column_name - unique_id_input_columns = ( - settings_obj.column_info_settings.unique_id_input_columns + + unique_id_col = unique_id_input_column + unique_id_input_columns = combine_unique_id_input_columns( + source_dataset_input_column, unique_id_input_column ) + id_expr_l = _composite_unique_id_from_nodes_sql(unique_id_input_columns, "l") id_expr_r = _composite_unique_id_from_nodes_sql(unique_id_input_columns, "r") @@ -401,9 +445,9 @@ def create_blocked_pairs_sql( {probability} from {exploded_id_pair_table.physical_name} as pairs left join {input_tablename_l} as l - on pairs.{unique_id_col}_l={id_expr_l} + on pairs.{unique_id_col.name_l}={id_expr_l} left join {input_tablename_r} as r - on pairs.{unique_id_col}_r={id_expr_r} + on pairs.{unique_id_col.name_r}={id_expr_r} """ return sql @@ -414,48 +458,61 @@ def as_dict(self): def materialise_exploded_id_tables( - linker: Linker, link_type: "LinkTypeLiteralType" -) -> list[ExplodingBlockingRule]: - settings_obj = linker._settings_obj - - blocking_rules = settings_obj._blocking_rules_to_generate_predictions + link_type: "LinkTypeLiteralType", + blocking_rules: List[BlockingRule], + db_api: DatabaseAPI, + splink_df_dict: dict[str, SplinkDataFrame], + source_dataset_input_column: InputColumn, + unique_id_input_column: InputColumn, +): exploding_blocking_rules = [ br for br in blocking_rules if isinstance(br, ExplodingBlockingRule) ] + if len(exploding_blocking_rules) == 0: return [] exploded_tables = [] pipeline = CTEPipeline() - nodes_with_tf = compute_df_concat_with_tf(linker, pipeline) - input_colnames = {col.name for col in nodes_with_tf.columns} + sql = vertically_concatenate_sql( + splink_df_dict, + salting_required=False, + source_dataset_column_name=source_dataset_input_column, + ) + pipeline.enqueue_sql(sql, "__splink__df_concat") + nodes_concat = db_api.sql_pipeline_to_splink_dataframe(pipeline) + + input_colnames = {col.name for col in nodes_concat.columns} for br in exploding_blocking_rules: - pipeline = CTEPipeline([nodes_with_tf]) + pipeline = CTEPipeline([nodes_concat]) arrays_to_explode_quoted = [ - InputColumn(colname, sql_dialect=linker._sql_dialect).quote().name + InputColumn(colname, sql_dialect=db_api.sql_dialect.name).quote().name for colname in br.array_columns_to_explode ] - expl_sql = linker._explode_arrays_sql( - "__splink__df_concat_with_tf", + + expl_sql = db_api.sql_dialect.explode_arrays_sql( + "__splink__df_concat", br.array_columns_to_explode, list(input_colnames.difference(arrays_to_explode_quoted)), ) pipeline.enqueue_sql( expl_sql, - "__splink__df_concat_with_tf_unnested", + "__splink__df_concat_unnested", ) base_name = "__splink__marginal_exploded_ids_blocking_rule" table_name = f"{base_name}_mk_{br.match_key}" - sql = br.marginal_exploded_id_pairs_table_sql(linker, br, link_type) + sql = br.marginal_exploded_id_pairs_table_sql( + source_dataset_input_column, unique_id_input_column, br, link_type + ) pipeline.enqueue_sql(sql, table_name) - marginal_ids_table = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline) + marginal_ids_table = db_api.sql_pipeline_to_splink_dataframe(pipeline) br.exploded_id_pair_table = marginal_ids_table exploded_tables.append(marginal_ids_table) @@ -481,14 +538,16 @@ def _sql_gen_where_condition(link_type, unique_id_cols): def block_using_rules_sqls( - linker: Linker, *, input_tablename_l: str, input_tablename_r: str, blocking_rules: List[BlockingRule], link_type: 
"LinkTypeLiteralType", + columns_to_select_sql: str, + source_dataset_input_column: InputColumn, + unique_id_input_column: InputColumn, set_match_probability_to_one: bool = False, -) -> list[dict[str, str]]: +): """Use the blocking rules specified in the linker's settings object to generate a SQL statement that will create pairwise record comparions according to the blocking rule(s). @@ -499,12 +558,12 @@ def block_using_rules_sqls( sqls = [] - settings_obj = linker._settings_obj - - where_condition = _sql_gen_where_condition( - link_type, settings_obj.column_info_settings.unique_id_input_columns + unique_id_input_columns = combine_unique_id_input_columns( + source_dataset_input_column, unique_id_input_column ) + where_condition = _sql_gen_where_condition(link_type, unique_id_input_columns) + # Cover the case where there are no blocking rules # This is a bit of a hack where if you do a self-join on 'true' # you create a cartesian product, rather than having separate code @@ -523,11 +582,13 @@ def block_using_rules_sqls( for br in blocking_rules: sql = br.create_blocked_pairs_sql( - linker, + unique_id_input_column=unique_id_input_column, + source_dataset_input_column=source_dataset_input_column, input_tablename_l=input_tablename_l, input_tablename_r=input_tablename_r, where_condition=where_condition, probability=probability, + sql_select_expr=columns_to_select_sql, ) br_sqls.append(sql) diff --git a/splink/files/chart_defs/blocking_rule_generated_comparisons.json b/splink/files/chart_defs/blocking_rule_generated_comparisons.json index ba95b8b16a..3150010344 100644 --- a/splink/files/chart_defs/blocking_rule_generated_comparisons.json +++ b/splink/files/chart_defs/blocking_rule_generated_comparisons.json @@ -23,21 +23,15 @@ "field": "cumulative_rows" }, "y": { - "field": "rule", + "field": "blocking_rule", "title": "SQL Blocking Rule", "sort": ["-x2"] }, - "color": { - "field": "rule", - "legend": null, - "scale": {"scheme": "category20c"} - }, "order": {"field": "cumulative_rows"}, - "tooltip": [ { "type": "nominal", - "field": "rule", + "field": "blocking_rule", "title": "SQL Condition" }, { @@ -55,14 +49,9 @@ { "type": "quantitative", "field": "cartesian", - "title": "Cartesian Product of Input Data", + "title": "Total comparisons in Cartesian product", "format": "," - }, - { - "type": "nominal", - "field": "reduction_ratio", - "title": "Reduction Ratio (cumulative rows/cartesian product)" } ] } -} +} \ No newline at end of file diff --git a/splink/find_brs_with_comparison_counts_below_threshold.py b/splink/find_brs_with_comparison_counts_below_threshold.py index 24a5eba9e7..6ac7043143 100644 --- a/splink/find_brs_with_comparison_counts_below_threshold.py +++ b/splink/find_brs_with_comparison_counts_below_threshold.py @@ -7,7 +7,7 @@ import pandas as pd from .analyse_blocking import ( - count_comparisons_from_blocking_rule_pre_filter_conditions, + count_comparisons_from_blocking_rule, ) from .blocking import BlockingRule from .blocking_rule_creator import BlockingRuleCreator @@ -158,9 +158,7 @@ def _search_tree_for_blocking_rules_below_threshold_count( br = _generate_blocking_rule(linker, current_combination) - comparison_count = count_comparisons_from_blocking_rule_pre_filter_conditions( - linker, br - ) + comparison_count = count_comparisons_from_blocking_rule(linker, br) already_visited.add(frozenset(current_combination)) diff --git a/splink/linker.py b/splink/linker.py index 298634f88f..e7f445fd6e 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -14,12 +14,7 @@ 
truth_space_table_from_labels_column, truth_space_table_from_labels_table, ) -from .analyse_blocking import ( - CumulativeComparisonsDict, - count_comparisons_from_blocking_rule_pre_filter_conditions, - cumulative_comparisons_generated_by_blocking_rules, - number_of_comparisons_generated_by_blocking_rule_post_filters_sql, -) +from .analyse_blocking import _cumulative_comparisons_to_be_scored_from_blocking_rules from .blocking import ( BlockingRule, SaltedBlockingRule, @@ -28,15 +23,11 @@ materialise_exploded_id_tables, ) from .blocking_rule_creator import BlockingRuleCreator -from .blocking_rule_creator_utils import ( - blocking_rule_args_to_list_of_blocking_rules, - to_blocking_rule_creator, -) +from .blocking_rule_creator_utils import to_blocking_rule_creator from .cache_dict_with_logging import CacheDictWithLogging from .charts import ( ChartReturnType, accuracy_chart, - cumulative_blocking_rule_comparisons_generated, match_weights_histogram, parameter_estimate_comparisons, precision_recall_chart, @@ -78,9 +69,6 @@ ) from .m_from_labels import estimate_m_from_pairwise_labels from .m_training import estimate_m_values_from_label_column -from .match_key_analysis import ( - count_num_comparisons_from_blocking_rules_for_prediction_sql, -) from .match_weights_histogram import histogram_data from .misc import ( ascii_uid, @@ -123,7 +111,6 @@ enqueue_df_concat, enqueue_df_concat_with_tf, split_df_concat_with_tf_into_two_tables_sqls, - vertically_concatenate_sql, ) logger = logging.getLogger(__name__) @@ -747,15 +734,45 @@ def deterministic_link(self) -> SplinkDataFrame: df_concat_with_tf = compute_df_concat_with_tf(self, pipeline) pipeline = CTEPipeline([df_concat_with_tf]) link_type = self._settings_obj._link_type - exploding_br_with_id_tables = materialise_exploded_id_tables(self, link_type) + + blocking_input_tablename_l = "__splink__df_concat_with_tf" + blocking_input_tablename_r = "__splink__df_concat_with_tf" + + link_type = self._settings_obj._link_type + if ( + len(self._input_tables_dict) == 2 + and self._settings_obj._link_type == "link_only" + ): + sqls = split_df_concat_with_tf_into_two_tables_sqls( + "__splink__df_concat_with_tf", + self._settings_obj.column_info_settings.source_dataset_column_name, + ) + pipeline.enqueue_list_of_sqls(sqls) + + blocking_input_tablename_l = "__splink__df_concat_with_tf_left" + blocking_input_tablename_r = "__splink__df_concat_with_tf_right" + link_type = "two_dataset_link_only" + + exploding_br_with_id_tables = materialise_exploded_id_tables( + link_type=link_type, + blocking_rules=self._settings_obj._blocking_rules_to_generate_predictions, + db_api=self.db_api, + splink_df_dict=self._input_tables_dict, + source_dataset_input_column=self._settings_obj.column_info_settings.source_dataset_input_column, + unique_id_input_column=self._settings_obj.column_info_settings.unique_id_input_column, + ) + + columns_to_select = self._settings_obj._columns_to_select_for_blocking + sql_select_expr = ", ".join(columns_to_select) sqls = block_using_rules_sqls( - self, - input_tablename_l="__splink__df_concat_with_tf", - input_tablename_r="__splink__df_concat_with_tf", + input_tablename_l=blocking_input_tablename_l, + input_tablename_r=blocking_input_tablename_r, blocking_rules=self._settings_obj._blocking_rules_to_generate_predictions, link_type=link_type, - set_match_probability_to_one=True, + columns_to_select_sql=sql_select_expr, + source_dataset_input_column=self._settings_obj.column_info_settings.source_dataset_input_column, + 
unique_id_input_column=self._settings_obj.column_info_settings.unique_id_input_column, ) pipeline.enqueue_list_of_sqls(sqls) @@ -1035,7 +1052,7 @@ def predict( self, threshold_match_probability: float = None, threshold_match_weight: float = None, - materialise_after_computing_term_frequencies: bool = True, + materialise_after_computing_term_frequencies=True, ) -> SplinkDataFrame: """Create a dataframe of scored pairwise comparisons using the parameters of the linkage model. @@ -1109,14 +1126,27 @@ def predict( # If exploded blocking rules exist, we need to materialise # the tables of ID pairs - exploding_br_with_id_tables = materialise_exploded_id_tables(self, link_type) + + exploding_br_with_id_tables = materialise_exploded_id_tables( + link_type=link_type, + blocking_rules=self._settings_obj._blocking_rules_to_generate_predictions, + db_api=self.db_api, + splink_df_dict=self._input_tables_dict, + source_dataset_input_column=self._settings_obj.column_info_settings.source_dataset_input_column, + unique_id_input_column=self._settings_obj.column_info_settings.unique_id_input_column, + ) + + columns_to_select = self._settings_obj._columns_to_select_for_blocking + sql_select_expr = ", ".join(columns_to_select) sqls = block_using_rules_sqls( - self, input_tablename_l=blocking_input_tablename_l, input_tablename_r=blocking_input_tablename_r, blocking_rules=self._settings_obj._blocking_rules_to_generate_predictions, link_type=link_type, + columns_to_select_sql=sql_select_expr, + source_dataset_input_column=self._settings_obj.column_info_settings.source_dataset_input_column, + unique_id_input_column=self._settings_obj.column_info_settings.unique_id_input_column, ) pipeline.enqueue_list_of_sqls(sqls) @@ -2561,230 +2591,6 @@ def parameter_estimate_comparisons_chart( return parameter_estimate_comparisons(records) - def count_num_comparisons_from_blocking_rule( - self, - blocking_rule: str | BlockingRuleCreator, - ) -> int: - """Compute the number of pairwise record comparisons that would be generated by - a blocking rule - - Args: - blocking_rule (str | BlockingRule): The blocking rule to analyse - link_type (str, optional): The link type. This is needed only if the - linker has not yet been provided with a settings dictionary. Defaults - to None. - unique_id_column_name (str, optional): This is needed only if the - linker has not yet been provided with a settings dictionary. Defaults - to None. 
- - Examples: - - ```py - br = block_on("name", "substr(dob,1,4)") - linker.count_num_comparisons_from_blocking_rule(br) - ``` - > 394 - - ```py - br = "l.surname = r.surname" - linker.count_num_comparisons_from_blocking_rule(br) - ``` - > 19387 - - Returns: - int: The number of comparisons generated by the blocking rule - """ - - blocking_rule_str = ( - to_blocking_rule_creator(blocking_rule) - .get_blocking_rule(self._sql_dialect) - .blocking_rule_sql - ) - - pipeline = CTEPipeline() - - sds_name = self._settings_obj.column_info_settings.source_dataset_column_name - - sql = vertically_concatenate_sql( - input_tables=self._input_tables_dict, - salting_required=self._settings_obj.salting_required, - source_dataset_column_name=sds_name, - ) - pipeline.enqueue_sql(sql, "__splink__df_concat") - - sql = number_of_comparisons_generated_by_blocking_rule_post_filters_sql( - self, blocking_rule_str - ) - pipeline.enqueue_sql(sql, "__splink__analyse_blocking_rule") - res_df = self.db_api.sql_pipeline_to_splink_dataframe(pipeline) - res = res_df.as_record_dict()[0] - return res["count_of_pairwise_comparisons_generated"] - - def _count_num_comparisons_from_blocking_rule_pre_filter_conditions( - self, - blocking_rule: BlockingRuleCreator | str | dict[str, Any], - ) -> int: - """Compute the number of pairwise record comparisons that would be generated by - a blocking rule, prior to any filters (non equi-join conditions) being applied - by the SQL engine. - - For more information on what this means, see - https://github.com/moj-analytical-services/splink/discussions/1391 - - Args: - blocking_rule (str): The blocking rule to analyse - - Returns: - int: The number of comparisons generated by the blocking rule - """ - - blocking_rule_obj = to_blocking_rule_creator(blocking_rule).get_blocking_rule( - self._sql_dialect - ) - return count_comparisons_from_blocking_rule_pre_filter_conditions( - self, blocking_rule_obj - ) - - def cumulative_comparisons_from_blocking_rules_records( - self, - blocking_rules: Optional[List[Union[str, BlockingRuleCreator]]] | None = None, - ) -> list[CumulativeComparisonsDict]: - """Output the number of comparisons generated by each successive blocking rule. - - This is equivalent to the output size of df_predict and details how many - comparisons each of your individual blocking rules will contribute to the - total. - - Args: - blocking_rules (str or list): The blocking rule(s) to compute comparisons - for. If null, the rules set out in your settings object will be used. - - Examples: - Generate total comparisons from Blocking Rules defined in settings - dictionary - ```py - linker_settings = DuckDBLinker(df, settings) - # Compute the cumulative number of comparisons generated by the rules - # in your settings object. - linker_settings.cumulative_comparisons_from_blocking_rules_records() - ``` - - Generate total comparisons with custom blocking rules. - ```py - blocking_rules = [ - "l.surname = r.surname", - "l.first_name = r.first_name - and substr(l.dob,1,4) = substr(r.dob,1,4)" - ] - - linker_settings.cumulative_comparisons_from_blocking_rules_records( - blocking_rules - ) - ``` - - Returns: - List: A list of blocking rules and the corresponding number of - comparisons it is forecast to generate. 
- """ - if blocking_rules: - brs = blocking_rule_args_to_list_of_blocking_rules( - blocking_rules, self._sql_dialect - ) - else: - brs = [] - - records = cumulative_comparisons_generated_by_blocking_rules( - self, brs, output_chart=False - ) - - return records - - def cumulative_num_comparisons_from_blocking_rules_chart( - self, - blocking_rules: Optional[List[Union[str, BlockingRuleCreator]]] = None, - ) -> ChartReturnType: - """Display a chart with the cumulative number of comparisons generated by a - selection of blocking rules. - - This is equivalent to the output size of df_predict and details how many - comparisons each of your individual blocking rules will contribute to the - total. - - Args: - blocking_rules (str or list): The blocking rule(s) to compute comparisons - for. If null, the rules set out in your settings object will be used. - - Examples: - ```py - linker_settings = DuckDBLinker(df, settings) - # Compute the cumulative number of comparisons generated by the rules - # in your settings object. - linker_settings.cumulative_num_comparisons_from_blocking_rules_chart() - >>> - # Generate total comparisons with custom blocking rules. - blocking_rules = [ - "l.surname = r.surname", - "l.first_name = r.first_name - and substr(l.dob,1,4) = substr(r.dob,1,4)" - ] - >>> - linker_settings.cumulative_num_comparisons_from_blocking_rules_chart( - blocking_rules - ) - ``` - - Returns: - altair.Chart: An altair chart - """ - - if blocking_rules: - brs = blocking_rule_args_to_list_of_blocking_rules( - blocking_rules, self._sql_dialect - ) - else: - brs = [] - - records = cumulative_comparisons_generated_by_blocking_rules( - self, brs, output_chart=True - ) - - return cumulative_blocking_rule_comparisons_generated(records) - - def count_num_comparisons_from_blocking_rules_for_prediction(self, df_predict): - """Counts the marginal number of edges created from each of the blocking rules - in `blocking_rules_to_generate_predictions` - - This is different to `count_num_comparisons_from_blocking_rule` - because it (a) analyses multiple blocking rules rather than a single rule, and - (b) deduplicates any comparisons that are generated, to tell you the - marginal effect of each entry in `blocking_rules_to_generate_predictions` - - Args: - df_predict (SplinkDataFrame): SplinkDataFrame with match weights - and probabilities of rows matching - - Examples: - ```py - linker = DuckDBLinker(df) - linker.load_model("settings.json") - df_predict = linker.predict(threshold_match_probability=0.95) - count_pairwise = linker.count_num_comparisons_from_blocking_rules_for_prediction(df_predict) - count_pairwise.as_pandas_dataframe(limit=5) - ``` - - Returns: - SplinkDataFrame: A SplinkDataFrame of the pairwise comparisons and - estimated pairwise comparisons generated by the blocking rules. - """ # noqa: E501 - sql = count_num_comparisons_from_blocking_rules_for_prediction_sql( - self, df_predict - ) - pipeline = CTEPipeline() - pipeline.enqueue_sql(sql, "__splink__match_key_analysis") - match_key_analysis = self.db_api.sql_pipeline_to_splink_dataframe(pipeline) - - return match_key_analysis - def match_weights_chart(self): """Display a chart of the (partial) match weights of the linkage model @@ -2992,7 +2798,8 @@ def estimate_probability_two_random_records_match( self, deterministic_matching_rules: List[Union[str, BlockingRuleCreator]], recall: float, - ) -> None: + max_rows_limit: int = 1e9, + ): """Estimate the model parameter `probability_two_random_records_match` using a direct estimation approach. 
@@ -3012,13 +2819,29 @@ def estimate_probability_two_random_records_match( raise ValueError( f"Estimated recall must be greater than 0 " f"and no more than 1. Supplied value {recall}." - ) + ) from None + blocking_rules: List[BlockingRule] = [] + for br in deterministic_matching_rules: + if isinstance(br, BlockingRule): + blocking_rules.append(br) + else: + blocking_rules.append( + to_blocking_rule_creator(br).get_blocking_rule( + self.db_api.sql_dialect.name + ) + ) - rules = blocking_rule_args_to_list_of_blocking_rules( - deterministic_matching_rules, self._sql_dialect + pd_df = _cumulative_comparisons_to_be_scored_from_blocking_rules( + splink_df_dict=self._input_tables_dict, + blocking_rules=blocking_rules, + link_type=self._settings_obj._link_type, + db_api=self.db_api, + max_rows_limit=max_rows_limit, + unique_id_column_name=self._settings_obj.column_info_settings.unique_id_column_name, + source_dataset_column_name=self._settings_obj.column_info_settings.source_dataset_column_name, ) - records = cumulative_comparisons_generated_by_blocking_rules(self, rules) + records = pd_df.to_dict(orient="records") summary_record = records[-1] num_observed_matches = summary_record["cumulative_rows"] @@ -3369,10 +3192,3 @@ def _detect_blocking_rules_for_em_training( "suggested_blocking_rules_as_splink_brs" ].iloc[0] return suggestion - - def _explode_arrays_sql( - self, tbl_name, columns_to_explode, other_columns_to_retain - ): - return self._sql_dialect_object.explode_arrays_sql( - tbl_name, columns_to_explode, other_columns_to_retain - ) diff --git a/splink/misc.py b/splink/misc.py index df2f7920cf..e62828f023 100644 --- a/splink/misc.py +++ b/splink/misc.py @@ -131,22 +131,6 @@ def calculate_cartesian(df_rows, link_type): ) -def calculate_reduction_ratio(N, cartesian): - """ - Args: - N (int): The number of record pairs generated by a - blocking rule. - cartesian (int): The cartesian product of your input - dataframe(s). - - Generates the reduction ratio. This represents the % reduction - in the comparison space as a result of using your given blocking - rule. This is a measure of how much the Blocking Rule reduces - the total search space. 
- """ - return 1 - (N / cartesian) - - def major_minor_version_greater_equal_than(this_version, base_comparison_version): this_version = this_version.split(".")[:2] this_version = [v.zfill(10) for v in this_version] diff --git a/splink/settings.py b/splink/settings.py index 177962390d..d41828af69 100644 --- a/splink/settings.py +++ b/splink/settings.py @@ -39,6 +39,25 @@ def source_dataset_column_name(self): else: return None + @property + def source_dataset_input_column(self): + if self._source_dataset_column_name_is_required: + return InputColumn( + self._source_dataset_column_name, + column_info_settings=self, + sql_dialect=self.sql_dialect, + ) + else: + return None + + @property + def unique_id_input_column(self): + return InputColumn( + self.unique_id_column_name, + column_info_settings=self, + sql_dialect=self.sql_dialect, + ) + @property def unique_id_input_columns(self) -> list[InputColumn]: cols = [] From c6fdb0a763c76ab00ef956857b2995bb8396d211 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Tue, 14 May 2024 15:52:23 +0100 Subject: [PATCH 02/59] refactor estimate u to use new block_using_rules_sqls --- splink/estimate_u.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/splink/estimate_u.py b/splink/estimate_u.py index 7f38469ba7..5086259e22 100644 --- a/splink/estimate_u.py +++ b/splink/estimate_u.py @@ -5,7 +5,7 @@ from copy import deepcopy from typing import TYPE_CHECKING, List -from .blocking import BlockingRule, block_using_rules_sqls, blocking_rule_to_obj +from .blocking import block_using_rules_sqls, blocking_rule_to_obj from .comparison_vector_values import compute_comparison_vector_values_sql from .expectation_maximisation import ( compute_new_parameters_sql, @@ -17,7 +17,7 @@ ) from .pipeline import CTEPipeline from .vertically_concatenate import ( - compute_df_concat_with_tf, + enqueue_df_concat, split_df_concat_with_tf_into_two_tables_sqls, ) @@ -61,8 +61,7 @@ def estimate_u_values(linker: Linker, max_pairs: float, seed: int = None) -> Non logger.info("----- Estimating u probabilities using random sampling -----") pipeline = CTEPipeline() - nodes_with_tf = compute_df_concat_with_tf(linker, pipeline) - pipeline = CTEPipeline([nodes_with_tf]) + pipeline = enqueue_df_concat(linker, pipeline) original_settings_obj = linker._settings_obj @@ -82,7 +81,7 @@ def estimate_u_values(linker: Linker, max_pairs: float, seed: int = None) -> Non if settings_obj._link_type in ["dedupe_only", "link_and_dedupe"]: sql = """ select count(*) as count - from __splink__df_concat_with_tf + from __splink__df_concat """ pipeline.enqueue_sql(sql, "__splink__df_concat_count") @@ -97,7 +96,7 @@ def estimate_u_values(linker: Linker, max_pairs: float, seed: int = None) -> Non if settings_obj._link_type == "link_only": sql = """ select count(source_dataset) as count - from __splink__df_concat_with_tf + from __splink__df_concat group by source_dataset """ pipeline.enqueue_sql(sql, "__splink__df_concat_count") @@ -118,20 +117,18 @@ def estimate_u_values(linker: Linker, max_pairs: float, seed: int = None) -> Non if sample_size > total_nodes: sample_size = total_nodes - # Grab __splink__df_concat_with_tf from cache - df_tf = training_linker._intermediate_table_cache.get_with_logging( - "__splink__df_concat_with_tf" - ) - pipeline = CTEPipeline(input_dataframes=[df_tf]) + pipeline = CTEPipeline() + pipeline = enqueue_df_concat(training_linker, pipeline) sql = f""" select * - from __splink__df_concat_with_tf + from __splink__df_concat 
{training_linker._random_sample_sql(proportion, sample_size, seed)} """ - pipeline.enqueue_sql(sql, "__splink__df_concat_with_tf_sample") + pipeline.enqueue_sql(sql, "__splink__df_concat_sample") df_sample = db_api.sql_pipeline_to_splink_dataframe(pipeline) + pipeline = CTEPipeline(input_dataframes=[df_sample]) if linker._sql_dialect == "duckdb" and max_pairs > 1e4: @@ -145,28 +142,31 @@ def estimate_u_values(linker: Linker, max_pairs: float, seed: int = None) -> Non else: settings_obj._blocking_rules_to_generate_predictions = [] - input_tablename_sample_l = "__splink__df_concat_with_tf_sample" - input_tablename_sample_r = "__splink__df_concat_with_tf_sample" + input_tablename_sample_l = "__splink__df_concat_sample" + input_tablename_sample_r = "__splink__df_concat_sample" if ( len(linker._input_tables_dict) == 2 and linker._settings_obj._link_type == "link_only" ): - input_tablename_sample_l = "__splink__df_concat_with_tf_sample_left" - input_tablename_sample_r = "__splink__df_concat_with_tf_sample_right" sqls = split_df_concat_with_tf_into_two_tables_sqls( - "__splink__df_concat_with_tf_sample", + "__splink__df_concat", linker._settings_obj.column_info_settings.source_dataset_column_name, sample_switch=True, ) + input_tablename_sample_l = "__splink__df_concat_sample_left" + input_tablename_sample_r = "__splink__df_concat_sample_right" + pipeline.enqueue_list_of_sqls(sqls) sql_infos = block_using_rules_sqls( - linker, input_tablename_l=input_tablename_sample_l, input_tablename_r=input_tablename_sample_r, - blocking_rules=[BlockingRule("1=1")], + blocking_rules=settings_obj._blocking_rules_to_generate_predictions, link_type=linker._settings_obj._link_type, + columns_to_select_sql=", ".join(settings_obj._columns_to_select_for_blocking), + source_dataset_input_column=settings_obj.column_info_settings.source_dataset_input_column, + unique_id_input_column=settings_obj.column_info_settings.unique_id_input_column, ) pipeline.enqueue_list_of_sqls(sql_infos) From a38033c5aca4888008880ef6b94a5fb9159035c1 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Tue, 14 May 2024 15:52:58 +0100 Subject: [PATCH 03/59] refactor estimate u to use new block_using_rules_sqls --- splink/vertically_concatenate.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/splink/vertically_concatenate.py b/splink/vertically_concatenate.py index 276238d64e..ab145266ed 100644 --- a/splink/vertically_concatenate.py +++ b/splink/vertically_concatenate.py @@ -191,7 +191,7 @@ def split_df_concat_with_tf_into_two_tables_sqls( sample_text = "_sample" if sample_switch else "" sql = f""" - select * from __splink__df_concat_with_tf{sample_text} + select * from {input_tablename}{sample_text} where {source_dataset_col} = (select min({source_dataset_col}) from {input_tablename}) """ @@ -199,19 +199,19 @@ def split_df_concat_with_tf_into_two_tables_sqls( sqls.append( { "sql": sql, - "output_table_name": f"__splink__df_concat_with_tf{sample_text}_left", + "output_table_name": f"{input_tablename}{sample_text}_left", } ) sql = f""" - select * from __splink__df_concat_with_tf{sample_text} + select * from {input_tablename}{sample_text} where {source_dataset_col} = (select max({source_dataset_col}) from {input_tablename}) """ sqls.append( { "sql": sql, - "output_table_name": f"__splink__df_concat_with_tf{sample_text}_right", + "output_table_name": f"{input_tablename}{sample_text}_right", } ) return sqls From 7eebdcc8fd157463f16a070330ed91889cefb1dd Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Tue, 14 May 2024 
16:39:02 +0100 Subject: [PATCH 04/59] test fixes --- splink/em_training_session.py | 9 +++++++-- tests/test_caching_tables.py | 20 +++++++++++--------- tests/test_charts.py | 4 ++-- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/splink/em_training_session.py b/splink/em_training_session.py index dfdfa127de..240a37b8d6 100644 --- a/splink/em_training_session.py +++ b/splink/em_training_session.py @@ -183,12 +183,17 @@ def _comparison_vectors(self) -> SplinkDataFrame: nodes_with_tf = compute_df_concat_with_tf(self._original_linker, pipeline) pipeline = CTEPipeline([nodes_with_tf]) + orig_settings = self._original_linker._settings_obj sqls = block_using_rules_sqls( - self._original_linker, input_tablename_l="__splink__df_concat_with_tf", input_tablename_r="__splink__df_concat_with_tf", blocking_rules=[self._blocking_rule_for_training], - link_type=self._original_linker._settings_obj._link_type, + link_type=orig_settings._link_type, + columns_to_select_sql=", ".join( + orig_settings._columns_to_select_for_blocking + ), + source_dataset_input_column=orig_settings.column_info_settings.source_dataset_input_column, + unique_id_input_column=orig_settings.column_info_settings.unique_id_input_column, ) pipeline.enqueue_list_of_sqls(sqls) diff --git a/tests/test_caching_tables.py b/tests/test_caching_tables.py index 99aa25960a..5d96459156 100644 --- a/tests/test_caching_tables.py +++ b/tests/test_caching_tables.py @@ -22,7 +22,7 @@ def test_cache_tracking_works(): settings = { "link_type": "dedupe_only", - "comparisons": [LevenshteinAtThresholds("name", 2)], + "comparisons": [ExactMatch("name").configure(term_frequency_adjustments=True)], "blocking_rules_to_generate_predictions": ["l.name = r.name"], } @@ -32,11 +32,11 @@ def test_cache_tracking_works(): cache = linker._intermediate_table_cache assert cache.is_in_executed_queries("__splink__df_concat_with_tf") is False - linker.estimate_u_using_random_sampling(max_pairs=1e4) + linker.predict() assert cache.is_in_executed_queries("__splink__df_concat_with_tf") is True - linker.estimate_u_using_random_sampling(max_pairs=1e4) + linker.predict() assert ( cache.is_in_queries_retrieved_from_cache("__splink__df_concat_with_tf") is True ) @@ -48,7 +48,7 @@ def test_cache_tracking_works(): assert ( cache.is_in_queries_retrieved_from_cache("__splink__df_concat_with_tf") is False ) - linker.estimate_u_using_random_sampling(max_pairs=1e4) + linker.predict() assert cache.is_in_executed_queries("__splink__df_concat_with_tf") is False assert ( cache.is_in_queries_retrieved_from_cache("__splink__df_concat_with_tf") is True @@ -57,7 +57,9 @@ def test_cache_tracking_works(): linker.invalidate_cache() cache.reset_executed_queries_tracker() cache.reset_queries_retrieved_from_cache_tracker() - linker.estimate_u_using_random_sampling(max_pairs=1e4) + linker.predict() + # Triggers adding to queries retrieved from cache + linker.predict() assert cache.is_in_executed_queries("__splink__df_concat_with_tf") is True assert ( cache.is_in_queries_retrieved_from_cache("__splink__df_concat_with_tf") is True @@ -94,7 +96,7 @@ def test_cache_used_when_registering_nodes_table(): linker = Linker(df, settings, database_api=db_api) cache = linker._intermediate_table_cache linker.register_table_input_nodes_concat_with_tf(splink__df_concat_with_tf) - linker.estimate_u_using_random_sampling(max_pairs=1e4) + linker.predict() assert cache.is_in_executed_queries("__splink__df_concat_with_tf") is False assert ( 
cache.is_in_queries_retrieved_from_cache("__splink__df_concat_with_tf") is True @@ -145,7 +147,7 @@ def test_cache_used_when_registering_tf_tables(): linker = Linker(df, settings, database_api=db_api) cache = linker._intermediate_table_cache - linker.estimate_u_using_random_sampling(max_pairs=1e4) + linker.predict() assert not cache.is_in_queries_retrieved_from_cache("__splink__df_tf_first_name") assert not cache.is_in_queries_retrieved_from_cache("__splink__df_tf_surname") @@ -156,7 +158,7 @@ def test_cache_used_when_registering_tf_tables(): linker = Linker(df, settings, database_api=db_api) cache = linker._intermediate_table_cache linker.register_term_frequency_lookup(surname_tf_table, "surname") - linker.estimate_u_using_random_sampling(max_pairs=1e4) + linker.predict() assert not cache.is_in_queries_retrieved_from_cache("__splink__df_tf_first_name") assert cache.is_in_queries_retrieved_from_cache("__splink__df_tf_surname") @@ -168,7 +170,7 @@ def test_cache_used_when_registering_tf_tables(): cache = linker._intermediate_table_cache linker.register_term_frequency_lookup(surname_tf_table, "surname") linker.register_term_frequency_lookup(first_name_tf_table, "first_name") - linker.estimate_u_using_random_sampling(max_pairs=1e4) + linker.predict() assert cache.is_in_queries_retrieved_from_cache("__splink__df_tf_first_name") assert cache.is_in_queries_retrieved_from_cache("__splink__df_tf_surname") diff --git a/tests/test_charts.py b/tests/test_charts.py index f72d4401ca..5c200c3f46 100644 --- a/tests/test_charts.py +++ b/tests/test_charts.py @@ -133,7 +133,7 @@ def test_m_u_charts(): linker = Linker(df, settings, database_api=db_api) linker.estimate_probability_two_random_records_match( - "l.true_match_id = r.true_match_id", recall=1.0 + ["l.true_match_id = r.true_match_id"], recall=1.0 ) linker.estimate_parameters_using_expectation_maximisation( @@ -161,7 +161,7 @@ def test_parameter_estimate_charts(): linker = Linker(df, settings, database_api=db_api) linker.estimate_probability_two_random_records_match( - "l.true_match_id = r.true_match_id", recall=1.0 + ["l.true_match_id = r.true_match_id"], recall=1.0 ) linker.estimate_parameters_using_expectation_maximisation( From 9220f3ce33a48f976a54a45f3174ccb91ede0ae9 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Tue, 14 May 2024 17:11:56 +0100 Subject: [PATCH 05/59] fix rr tests --- splink/analyse_blocking.py | 2 +- tests/test_estimate_prob_two_rr_match.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/splink/analyse_blocking.py b/splink/analyse_blocking.py index 00629ec35d..b600c88f40 100644 --- a/splink/analyse_blocking.py +++ b/splink/analyse_blocking.py @@ -319,7 +319,7 @@ def _cumulative_comparisons_to_be_scored_from_blocking_rules( input_tablename_l=blocking_input_tablename_l, input_tablename_r=blocking_input_tablename_r, blocking_rules=blocking_rules, - link_type="dedupe_only", + link_type=link_type, set_match_probability_to_one=True, unique_id_input_column=unique_id_input_column, source_dataset_input_column=source_dataset_input_column, diff --git a/tests/test_estimate_prob_two_rr_match.py b/tests/test_estimate_prob_two_rr_match.py index 002e5d12ea..46c7537a46 100644 --- a/tests/test_estimate_prob_two_rr_match.py +++ b/tests/test_estimate_prob_two_rr_match.py @@ -353,7 +353,7 @@ def check_range(p): # all comparisons matches using this rule, so we must have perfect recall # using recall = 80% is inconsistent, so should get an error linker.estimate_probability_two_random_records_match( - 
"l.first_name = r.first_name", recall=0.8 + ["l.first_name = r.first_name"], recall=0.8 ) check_range(linker._settings_obj._probability_two_random_records_match) @@ -361,10 +361,10 @@ def check_range(p): recall_min_city = 6 / 15 with pytest.raises(ValueError): linker.estimate_probability_two_random_records_match( - "l.city = r.city", recall=(recall_min_city - 1e-6) + ["l.city = r.city"], recall=(recall_min_city - 1e-6) ) linker.estimate_probability_two_random_records_match( - "l.city = r.city", recall=recall_min_city + ["l.city = r.city"], recall=recall_min_city ) check_range(linker._settings_obj._probability_two_random_records_match) @@ -373,7 +373,7 @@ def check_range(p): # so should give a warning at this stage with caplog.at_level(logging.WARNING): linker.estimate_probability_two_random_records_match( - "l.surname = r.surname", recall=0.7 + ["l.surname = r.surname"], recall=0.7 ) assert "WARNING:" in caplog.text check_range(linker._settings_obj._probability_two_random_records_match) @@ -382,7 +382,7 @@ def check_range(p): # as we have a trivial linkage model with caplog.at_level(logging.WARNING): linker.estimate_probability_two_random_records_match( - "l.first_name = r.first_name", recall=1.0 + ["l.first_name = r.first_name"], recall=1.0 ) assert "WARNING:" in caplog.text check_range(linker._settings_obj._probability_two_random_records_match) @@ -390,13 +390,13 @@ def check_range(p): # check we get errors if we pass bogus values for recall with pytest.raises(ValueError): linker.estimate_probability_two_random_records_match( - "l.first_name = r.first_name", recall=0.0 + ["l.first_name = r.first_name"], recall=0.0 ) with pytest.raises(ValueError): linker.estimate_probability_two_random_records_match( - "l.first_name = r.first_name", recall=1.2 + ["l.first_name = r.first_name"], recall=1.2 ) with pytest.raises(ValueError): linker.estimate_probability_two_random_records_match( - "l.first_name = r.first_name", recall=-0.4 + ["l.first_name = r.first_name"], recall=-0.4 ) From 196b1fce9d8b1f7c0fbcc79ed0feed6878e94668 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Tue, 14 May 2024 17:19:21 +0100 Subject: [PATCH 06/59] fix --- splink/analyse_blocking.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/splink/analyse_blocking.py b/splink/analyse_blocking.py index b600c88f40..92adaada1b 100644 --- a/splink/analyse_blocking.py +++ b/splink/analyse_blocking.py @@ -354,6 +354,10 @@ def _cumulative_comparisons_to_be_scored_from_blocking_rules( records = db_api.sql_pipeline_to_splink_dataframe(pipeline).as_record_dict() + if len(records) == 0: + # records = [{'row_count': 0, 'rule': 'l.surname = r.surname', 'cumulative_rows': 0, 'cartesian': cartesian_count, 'start': 0}] + raise ValueError("No comparisons generated by blocking rules") + # Lookup table match_key -> blocking_rule rules = {i: r.blocking_rule_sql for i, r in enumerate(blocking_rules)} From 3460266915aa1e923ded6ceead14e2875d0bdf23 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Tue, 14 May 2024 17:25:39 +0100 Subject: [PATCH 07/59] deal with case of no matches --- splink/analyse_blocking.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/splink/analyse_blocking.py b/splink/analyse_blocking.py index 92adaada1b..0a0f32c848 100644 --- a/splink/analyse_blocking.py +++ b/splink/analyse_blocking.py @@ -354,10 +354,6 @@ def _cumulative_comparisons_to_be_scored_from_blocking_rules( records = db_api.sql_pipeline_to_splink_dataframe(pipeline).as_record_dict() - if len(records) == 0: - # records = 
[{'row_count': 0, 'rule': 'l.surname = r.surname', 'cumulative_rows': 0, 'cartesian': cartesian_count, 'start': 0}]
-        raise ValueError("No comparisons generated by blocking rules")
-
     # Lookup table match_key -> blocking_rule
     rules = {i: r.blocking_rule_sql for i, r in enumerate(blocking_rules)}
 
@@ -374,7 +370,21 @@ def _cumulative_comparisons_to_be_scored_from_blocking_rules(
         "match_key",
         "start",
     ]
-    return pd.DataFrame(records)[col_order]
+    if len(records) > 0:
+        return pd.DataFrame(records)[col_order]
+    else:
+        return pd.DataFrame(
+            [
+                {
+                    "blocking_rule": "No blocking rules",
+                    "row_count": 0,
+                    "cumulative_rows": 0,
+                    "cartesian": cartesian_count,
+                    "match_key": 0,
+                    "start": 0,
+                }
+            ]
+        )
 

From fe93bd44451a56845f622ffbfa9c4614d0e6e4fb Mon Sep 17 00:00:00 2001
From: Robin Linacre
Date: Tue, 14 May 2024 17:30:39 +0100
Subject: [PATCH 08/59] fix deterministic link test

---
 tests/test_full_example_deterministic_link.py | 27 +++++++++++++------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/tests/test_full_example_deterministic_link.py b/tests/test_full_example_deterministic_link.py
index ce274fc03c..476921f2f1 100644
--- a/tests/test_full_example_deterministic_link.py
+++ b/tests/test_full_example_deterministic_link.py
@@ -2,6 +2,9 @@
 
 import pandas as pd
 
+from splink.analyse_blocking import (
+    cumulative_comparisons_to_be_scored_from_blocking_rules_chart,
+)
 from splink.linker import Linker
 
 from .decorator import mark_with_dialects_excluding
@@ -13,22 +16,30 @@ def test_deterministic_link_full_example(dialect, tmp_path, test_helpers):
     df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
     df = helper.convert_frame(df)
 
+    br_for_predict = [
+        "l.first_name = r.first_name and l.surname = r.surname and l.dob = r.dob",
+        "l.surname = r.surname and l.dob = r.dob and l.email = r.email",
+        "l.first_name = r.first_name and l.surname = r.surname "
+        "and l.email = r.email",
+    ]
     settings = {
         "link_type": "dedupe_only",
-        "blocking_rules_to_generate_predictions": [
-            "l.first_name = r.first_name and l.surname = r.surname and l.dob = r.dob",
-            "l.surname = r.surname and l.dob = r.dob and l.email = r.email",
-            "l.first_name = r.first_name and l.surname = r.surname "
-            "and l.email = r.email",
-        ],
+        "blocking_rules_to_generate_predictions": br_for_predict,
        "retain_matching_columns": True,
        "retain_intermediate_calculation_columns": True,
    }
+    db_api = helper.extra_linker_args()["database_api"]
+
+    cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
+        table_or_tables=df,
+        blocking_rule_creators=br_for_predict,
+        link_type="dedupe_only",
+        db_api=db_api,
+        unique_id_column_name="unique_id",
+    )
 
     linker = Linker(df, settings, **helper.extra_linker_args())
 
-    linker.cumulative_num_comparisons_from_blocking_rules_chart()
-
     df_predict = linker.deterministic_link()
     clusters = linker.cluster_pairwise_predictions_at_threshold(df_predict)
 
From e8e9d2b06dd40c7c1d650ebf403c958bf3f1ead9 Mon Sep 17 00:00:00 2001
From: Robin Linacre
Date: Tue, 14 May 2024 18:44:53 +0100
Subject: [PATCH 09/59] fix unlinkables

---
 splink/linker.py                  |  7 +++++--
 splink/unlinkables.py             |  4 ++--
 tests/test_full_example_duckdb.py | 14 ++++++++++----
 3 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/splink/linker.py b/splink/linker.py
index e7f445fd6e..1e640c27a3 100644
--- a/splink/linker.py
+++ b/splink/linker.py
@@ -1423,7 +1423,8 @@ def _self_link(self) -> SplinkDataFrame:
         """
         # Block on uid i.e. 
create pairwise record comparisons where the uid matches - uid_cols = self._settings_obj.column_info_settings.unique_id_input_columns + settings = self._settings_obj + uid_cols = settings.column_info_settings.unique_id_input_columns uid_l = _composite_unique_id_from_edges_sql(uid_cols, None, "l") uid_r = _composite_unique_id_from_edges_sql(uid_cols, None, "r") @@ -1437,11 +1438,13 @@ def _self_link(self) -> SplinkDataFrame: pipeline = CTEPipeline([nodes_with_tf]) sqls = block_using_rules_sqls( - self, input_tablename_l="__splink__df_concat_with_tf", input_tablename_r="__splink__df_concat_with_tf", blocking_rules=[blocking_rule], link_type="self_link", + columns_to_select_sql=", ".join(settings._columns_to_select_for_blocking), + source_dataset_input_column=settings.column_info_settings.source_dataset_input_column, + unique_id_input_column=settings.column_info_settings.unique_id_input_column, ) pipeline.enqueue_list_of_sqls(sqls) diff --git a/splink/unlinkables.py b/splink/unlinkables.py index a24d815f97..21eb183b47 100644 --- a/splink/unlinkables.py +++ b/splink/unlinkables.py @@ -19,7 +19,7 @@ def unlinkables_data(linker: Linker) -> dict[str, Any]: linker (Splink): A Splink data linker """ - self_link = linker._self_link() + self_link_df = linker._self_link() pipeline = CTEPipeline() @@ -27,7 +27,7 @@ def unlinkables_data(linker: Linker) -> dict[str, Any]: select round(match_weight, 2) as match_weight, round(match_probability, 5) as match_probability - from {self_link.physical_name} + from {self_link_df.physical_name} """ pipeline.enqueue_sql(sql, "__splink__df_round_self_link") diff --git a/tests/test_full_example_duckdb.py b/tests/test_full_example_duckdb.py index f5f11d47da..fcc9d8fe06 100644 --- a/tests/test_full_example_duckdb.py +++ b/tests/test_full_example_duckdb.py @@ -8,6 +8,7 @@ import splink.comparison_level_library as cll import splink.comparison_library as cl +from splink.analyse_blocking import count_comparisons_from_blocking_rule from splink.duckdb.database_api import DuckDBAPI from splink.exploratory import completeness_chart, profile_columns from splink.linker import Linker @@ -40,6 +41,15 @@ def test_full_example_duckdb(tmp_path): ] db_api = DuckDBAPI(connection=os.path.join(tmp_path, "duckdb.db")) + + count_comparisons_from_blocking_rule( + table_or_tables=df, + blocking_rule='l.first_name = r.first_name and l."SUR name" = r."SUR name"', + link_type="dedupe_only", + db_api=db_api, + unique_id_column_name="unique_id", + ) + linker = Linker( df, settings=settings_dict, @@ -47,10 +57,6 @@ def test_full_example_duckdb(tmp_path): # output_schema="splink_in_duckdb", ) - linker.count_num_comparisons_from_blocking_rule( - 'l.first_name = r.first_name and l."SUR name" = r."SUR name"' - ) - profile_columns( df, db_api, From 36e5eb7803c64070362f649b6452a0f770305240 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Tue, 14 May 2024 19:12:59 +0100 Subject: [PATCH 10/59] fix tests of efficient join types --- splink/vertically_concatenate.py | 4 ++-- ...join_type_for_estimate_u_and_predict_are_efficient.py | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/splink/vertically_concatenate.py b/splink/vertically_concatenate.py index ab145266ed..aca7e7c034 100644 --- a/splink/vertically_concatenate.py +++ b/splink/vertically_concatenate.py @@ -193,7 +193,7 @@ def split_df_concat_with_tf_into_two_tables_sqls( sql = f""" select * from {input_tablename}{sample_text} where {source_dataset_col} = - (select min({source_dataset_col}) from {input_tablename}) + (select 
min({source_dataset_col}) from {input_tablename}{sample_text}) """ sqls.append( @@ -206,7 +206,7 @@ def split_df_concat_with_tf_into_two_tables_sqls( sql = f""" select * from {input_tablename}{sample_text} where {source_dataset_col} = - (select max({source_dataset_col}) from {input_tablename}) + (select max({source_dataset_col}) from {input_tablename}{sample_text}) """ sqls.append( { diff --git a/tests/test_join_type_for_estimate_u_and_predict_are_efficient.py b/tests/test_join_type_for_estimate_u_and_predict_are_efficient.py index 0b6e05d2b5..5532c201f7 100644 --- a/tests/test_join_type_for_estimate_u_and_predict_are_efficient.py +++ b/tests/test_join_type_for_estimate_u_and_predict_are_efficient.py @@ -125,8 +125,9 @@ def test_dedupe_only(): linker.estimate_u_using_random_sampling(max_pairs=1000) all_log_messages = "\n".join(log_list) all_log_messages = re.sub(r"\s+", " ", all_log_messages) + assert ( - "from __splink__df_concat_with_tf_sample as l inner join __splink__df_concat_with_tf_sample as r" # noqa: E501 + "from __splink__df_concat_sample as l inner join __splink__df_concat_sample as r" # noqa: E501 in all_log_messages ) @@ -181,7 +182,7 @@ def test_link_and_dedupe(): all_log_messages = "\n".join(log_list) all_log_messages = re.sub(r"\s+", " ", all_log_messages) assert ( - "from __splink__df_concat_with_tf_sample as l inner join __splink__df_concat_with_tf_sample as r" # noqa: E501 + "from __splink__df_concat_sample as l inner join __splink__df_concat_sample as r" # noqa: E501 in all_log_messages ) @@ -237,7 +238,7 @@ def test_link_only_two(): all_log_messages = "\n".join(log_list) all_log_messages = re.sub(r"\s+", " ", all_log_messages) assert ( - "from __splink__df_concat_with_tf_sample_left as l inner join __splink__df_concat_with_tf_sample_right as r" # noqa: E501 + "from __splink__df_concat_sample_left as l inner join __splink__df_concat_sample_right as r" # noqa: E501 in all_log_messages ) @@ -294,7 +295,7 @@ def test_link_only_three(): all_log_messages = "\n".join(log_list) all_log_messages = re.sub(r"\s+", " ", all_log_messages) assert ( - "from __splink__df_concat_with_tf_sample as l inner join __splink__df_concat_with_tf_sample as r" # noqa: E501 + "from __splink__df_concat_sample as l inner join __splink__df_concat_sample as r" # noqa: E501 in all_log_messages ) From 309e02dee4cfe391db78e00cbaa6f98d1a022b4c Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Tue, 14 May 2024 19:25:46 +0100 Subject: [PATCH 11/59] unlinkables --- splink/linker.py | 6 +++--- tests/test_full_example_duckdb.py | 24 ++++++++++++------------ tests/test_full_example_postgres.py | 2 +- tests/test_full_example_spark.py | 2 +- tests/test_full_example_sqlite.py | 2 +- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/splink/linker.py b/splink/linker.py index 1e640c27a3..5a64b44415 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -2471,7 +2471,7 @@ def waterfall_chart( def unlinkables_chart( self, x_col: str = "match_weight", - source_dataset: str | None = None, + name_of_data_in_title: str | None = None, as_dict: bool = False, ) -> ChartReturnType: """Generate an interactive chart displaying the proportion of records that @@ -2483,7 +2483,7 @@ def unlinkables_chart( Args: x_col (str, optional): Column to use for the x-axis. Defaults to "match_weight". - source_dataset (str, optional): Name of the source dataset to use for + name_of_data_in_title (str, optional): Name of the source dataset to use for the title of the output chart. 
as_dict (bool, optional): If True, return a dict version of the chart. @@ -2506,7 +2506,7 @@ def unlinkables_chart( # Link our initial df on itself and calculate the % of unlinkable entries records = unlinkables_data(self) - return unlinkables_chart(records, x_col, source_dataset, as_dict) + return unlinkables_chart(records, x_col, name_of_data_in_title, as_dict) def comparison_viewer_dashboard( self, diff --git a/tests/test_full_example_duckdb.py b/tests/test_full_example_duckdb.py index fcc9d8fe06..55beda28d3 100644 --- a/tests/test_full_example_duckdb.py +++ b/tests/test_full_example_duckdb.py @@ -106,7 +106,7 @@ def test_full_example_duckdb(tmp_path): out_path=os.path.join(tmp_path, "test_cluster_studio.html"), ) - linker.unlinkables_chart(source_dataset="Testing") + linker.unlinkables_chart(name_of_data_in_title="Testing") _test_table_registration(linker) @@ -120,21 +120,21 @@ def test_full_example_duckdb(tmp_path): "cluster": 10000, } - linker.find_matches_to_new_records( - [record], blocking_rules=[], match_weight_threshold=-10000 - ) + # linker.find_matches_to_new_records( + # [record], blocking_rules=[], match_weight_threshold=-10000 + # ) - # Test saving and loading - path = os.path.join(tmp_path, "model.json") - linker.save_model_to_json(path) + # # Test saving and loading + # path = os.path.join(tmp_path, "model.json") + # linker.save_model_to_json(path) - db_api = DuckDBAPI() - linker_2 = Linker(df, settings=simple_settings, database_api=db_api) + # db_api = DuckDBAPI() + # linker_2 = Linker(df, settings=simple_settings, database_api=db_api) - linker_2 = Linker(df, database_api=db_api, settings=path) + # linker_2 = Linker(df, database_api=db_api, settings=path) - # Test that writing to files works as expected - _test_write_functionality(linker_2, pd.read_csv) + # # Test that writing to files works as expected + # _test_write_functionality(linker_2, pd.read_csv) # Create some dummy dataframes for the link only test diff --git a/tests/test_full_example_postgres.py b/tests/test_full_example_postgres.py index 8fdffa3319..86db08b666 100644 --- a/tests/test_full_example_postgres.py +++ b/tests/test_full_example_postgres.py @@ -84,7 +84,7 @@ def test_full_example_postgres(tmp_path, pg_engine): out_path=os.path.join(tmp_path, "test_cluster_studio.html"), ) - linker.unlinkables_chart(source_dataset="Testing") + linker.unlinkables_chart(name_of_data_in_title="Testing") _test_table_registration(linker) diff --git a/tests/test_full_example_spark.py b/tests/test_full_example_spark.py index 13710ccf93..345a6501fc 100644 --- a/tests/test_full_example_spark.py +++ b/tests/test_full_example_spark.py @@ -111,7 +111,7 @@ def spark_csv_read(x): out_path=os.path.join(tmp_path, "test_cluster_studio.html"), ) - linker.unlinkables_chart(source_dataset="Testing") + linker.unlinkables_chart(name_of_data_in_title="Testing") # Test that writing to files works as expected # spark_csv_read = lambda x: linker.spark.read.csv(x, header=True).toPandas() # _test_write_functionality(linker, spark_csv_read) diff --git a/tests/test_full_example_sqlite.py b/tests/test_full_example_sqlite.py index 10d752bfe0..f0a7e89af2 100644 --- a/tests/test_full_example_sqlite.py +++ b/tests/test_full_example_sqlite.py @@ -56,7 +56,7 @@ def test_full_example_sqlite(tmp_path): linker.cluster_pairwise_predictions_at_threshold(df_predict, 0.5) - linker.unlinkables_chart(source_dataset="Testing") + linker.unlinkables_chart(name_of_data_in_title="Testing") _test_table_registration(linker) From 9d485e05077f542de17ecf5fc7412f75a97dcd32 
Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Tue, 14 May 2024 19:32:24 +0100 Subject: [PATCH 12/59] find matches to new records --- splink/linker.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/splink/linker.py b/splink/linker.py index 5a64b44415..ed45b57b88 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -1180,8 +1180,8 @@ def predict( def find_matches_to_new_records( self, records_or_tablename: AcceptableInputTableType | str, - blocking_rules: list[BlockingRule | dict[str, Any] | str] - | BlockingRule + blocking_rules: list[BlockingRuleCreator | dict[str, Any] | str] + | BlockingRuleCreator | dict[str, Any] | str = [], match_weight_threshold: float = -4, @@ -1270,13 +1270,15 @@ def find_matches_to_new_records( pipeline = add_unique_id_and_source_dataset_cols_if_needed( self, new_records_df, pipeline ) - + settings = self._settings_obj sqls = block_using_rules_sqls( - self, input_tablename_l="__splink__df_concat_with_tf", input_tablename_r="__splink__df_new_records_with_tf", blocking_rules=blocking_rule_list, link_type="two_dataset_link_only", + columns_to_select_sql=", ".join(settings._columns_to_select_for_blocking), + source_dataset_input_column=settings.column_info_settings.source_dataset_input_column, + unique_id_input_column=settings.column_info_settings.unique_id_input_column, ) pipeline.enqueue_list_of_sqls(sqls) From a553b5c08a3f8e8097dde79010c8b192cbcc00f3 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 06:12:45 +0100 Subject: [PATCH 13/59] m training --- splink/m_training.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/splink/m_training.py b/splink/m_training.py index c607abcf4e..3a446fb191 100644 --- a/splink/m_training.py +++ b/splink/m_training.py @@ -37,11 +37,13 @@ def estimate_m_values_from_label_column(linker, df_dict, label_colname): pipeline = CTEPipeline([nodes_with_tf]) sqls = block_using_rules_sqls( - training_linker, input_tablename_l="__splink__df_concat_with_tf", input_tablename_r="__splink__df_concat_with_tf", blocking_rules=[BlockingRule(f"l.{label_colname} = r.{label_colname}")], - link_type=training_linker._settings_obj._link_type, + link_type=settings_obj._link_type, + columns_to_select_sql=", ".join(settings_obj._columns_to_select_for_blocking), + source_dataset_input_column=settings_obj.column_info_settings.source_dataset_input_column, + unique_id_input_column=settings_obj.column_info_settings.unique_id_input_column, ) pipeline.enqueue_list_of_sqls(sqls) From 4a454841b498f7b4ced356cca24b9462abd87213 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 06:17:20 +0100 Subject: [PATCH 14/59] fix test new db api test --- tests/test_full_example_deterministic_link.py | 2 +- tests/test_new_db_api.py | 19 ++++++++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/tests/test_full_example_deterministic_link.py b/tests/test_full_example_deterministic_link.py index 476921f2f1..d72ea935ad 100644 --- a/tests/test_full_example_deterministic_link.py +++ b/tests/test_full_example_deterministic_link.py @@ -28,7 +28,7 @@ def test_deterministic_link_full_example(dialect, tmp_path, test_helpers): "retain_matching_columns": True, "retain_intermediate_calculation_columns": True, } - db_api = helper.extra_linker_args()["database_api"] + db_api = helper.DatabaseAPI(**helper.db_api_args()) cumulative_comparisons_to_be_scored_from_blocking_rules_chart( table_or_tables=df, diff --git a/tests/test_new_db_api.py b/tests/test_new_db_api.py index 
7b4316ac9b..034ac018b2 100644 --- a/tests/test_new_db_api.py +++ b/tests/test_new_db_api.py @@ -2,6 +2,10 @@ import splink.comparison_level_library as cll import splink.comparison_library as cl +from splink import block_on +from splink.analyse_blocking import ( + cumulative_comparisons_to_be_scored_from_blocking_rules_chart, +) from splink.exploratory import profile_columns from splink.linker import Linker @@ -111,12 +115,17 @@ def test_charts(dialect, test_helpers, tmp_path): df = helper.load_frame_from_csv("./tests/datasets/fake_1000_from_splink_demos.csv") db_api = helper.DatabaseAPI(**helper.db_api_args()) - linker = Linker( - df, - cl_settings, - db_api, + + cumulative_comparisons_to_be_scored_from_blocking_rules_chart( + table_or_tables=df, + blocking_rule_creators=[block_on("dob"), block_on("first_name")], + link_type="dedupe_only", + db_api=db_api, + unique_id_column_name="unique_id", + source_dataset_column_name="source_dataset", ) - linker.cumulative_num_comparisons_from_blocking_rules_chart() + + linker = Linker(df, cl_settings, db_api) linker.estimate_probability_two_random_records_match( ["l.first_name = r.first_name AND l.surname = r.surname"], From 3b0056fba9a175d2c7a726863bb8af848852e000 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 06:20:53 +0100 Subject: [PATCH 15/59] duckdb tests pass --- tests/test_settings_options.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/test_settings_options.py b/tests/test_settings_options.py index 2c914cf94c..f04ab9754c 100644 --- a/tests/test_settings_options.py +++ b/tests/test_settings_options.py @@ -3,6 +3,7 @@ import pandas as pd import splink.comparison_library as cl +from splink import block_on from .decorator import mark_with_dialects_excluding @@ -32,8 +33,11 @@ def test_model_heavily_customised_settings(test_helpers, dialect, tmp_path): settings = { "link_type": "link_and_dedupe", "blocking_rules_to_generate_predictions": [ - "l.city = r.city AND l.dob = r.dob", - "l.first_name = r.first_name and l.surname = r.surname", + block_on("city", "dob"), + """l.first_name = r.first_name + and l.surname = r.surname + and substr(l.first_name, 1, 1) = 'j' + """, ], "comparisons": [ cl.ExactMatch("first_name"), @@ -54,7 +58,7 @@ def test_model_heavily_customised_settings(test_helpers, dialect, tmp_path): } linker = helper.Linker([df_l, df_r], settings, **helper.extra_linker_args()) # run through a few common operations to check functioning - linker.estimate_probability_two_random_records_match("l.dob = r.dob", 0.5) + linker.estimate_probability_two_random_records_match(["l.dob = r.dob"], 0.5) linker.estimate_u_using_random_sampling(2e4) linker.estimate_parameters_using_expectation_maximisation("l.dob = r.dob") df_predict = linker.predict(0.1) From 7ddf4e267315e32bf9dd4dfb9e4d4e37aeed5396 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 07:19:09 +0100 Subject: [PATCH 16/59] start to fix analyse blocking tests --- splink/analyse_blocking.py | 15 +- tests/test_analyse_blocking.py | 801 +++++++++++++++++---------------- 2 files changed, 422 insertions(+), 394 deletions(-) diff --git a/splink/analyse_blocking.py b/splink/analyse_blocking.py index 0a0f32c848..999660f2ed 100644 --- a/splink/analyse_blocking.py +++ b/splink/analyse_blocking.py @@ -40,21 +40,24 @@ def _number_of_comparisons_generated_by_blocking_rule_post_filters_sqls( ) -> str: input_dataframes = list(input_data_dict.values()) - if len(input_dataframes) > 1: + two_dataset_link_only = link_type == "link_only" 
and len(input_dataframes) == 2 + if two_dataset_link_only: + link_type = "two_dataset_link_only" + + if len(input_dataframes) > 1 and not two_dataset_link_only: unique_id_cols = [ - InputColumn(unique_id_column_name, sql_dialect=db_api.sql_dialect.name), InputColumn("source_dataset", sql_dialect=db_api.sql_dialect.name), + InputColumn(unique_id_column_name, sql_dialect=db_api.sql_dialect.name), ] else: unique_id_cols = [ InputColumn(unique_id_column_name, sql_dialect=db_api.sql_dialect.name), ] + where_condition = _sql_gen_where_condition(link_type, unique_id_cols) sqls = [] - two_dataset_link_only = link_type == "link_only" and len(input_dataframes) == 2 - if two_dataset_link_only: input_tablename_l = input_dataframes[0].physical_name input_tablename_r = input_dataframes[1].physical_name @@ -393,7 +396,7 @@ def _count_comparisons_generated_from_blocking_rule( blocking_rule: BlockingRule, link_type: link_type_type, db_api: DatabaseAPI, - compute_post_filter_count: bool = False, + compute_post_filter_count: bool, max_rows_limit: int = 1e9, unique_id_column_name: str = "unique_id", ): @@ -474,7 +477,7 @@ def count_comparisons_from_blocking_rule( link_type: link_type_type, db_api: DatabaseAPI, unique_id_column_name: str, - compute_post_filter_count: bool = False, + compute_post_filter_count: bool = True, max_rows_limit: int = 1e9, ): if not isinstance(blocking_rule, BlockingRule): diff --git a/tests/test_analyse_blocking.py b/tests/test_analyse_blocking.py index ca7ae534f3..acfca9d78c 100644 --- a/tests/test_analyse_blocking.py +++ b/tests/test_analyse_blocking.py @@ -1,7 +1,9 @@ import duckdb import pandas as pd -from splink.analyse_blocking import cumulative_comparisons_generated_by_blocking_rules +from splink.analyse_blocking import ( + count_comparisons_from_blocking_rule, +) from splink.blocking import BlockingRule from splink.blocking_rule_library import CustomRule, Or, block_on from splink.duckdb.database_api import DuckDBAPI @@ -14,7 +16,6 @@ @mark_with_dialects_excluding() def test_analyse_blocking_slow_methodology(test_helpers, dialect): helper = test_helpers[dialect] - Linker = helper.Linker df_1 = pd.DataFrame( [ @@ -32,424 +33,448 @@ def test_analyse_blocking_slow_methodology(test_helpers, dialect): {"unique_id": 3, "first_name": "Jayne", "surname": "Tailor"}, ] ) - settings = {"link_type": "dedupe_only"} - linker = Linker(df_1, settings, **helper.extra_linker_args()) - res = linker.count_num_comparisons_from_blocking_rule( - "1=1", + df_3 = pd.DataFrame( + [ + {"unique_id": 1, "first_name": "John", "surname": "Smith"}, + {"unique_id": 2, "first_name": "Mary", "surname": "Jones"}, + ] + ) + + db_api = helper.DatabaseAPI(**helper.db_api_args()) + args = { + "link_type": "dedupe_only", + "db_api": db_api, + "unique_id_column_name": "unique_id", + } + + res_dict = count_comparisons_from_blocking_rule( + table_or_tables=df_1, blocking_rule="1=1", **args ) + res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"] assert res == 4 * 3 / 2 - res = linker.count_num_comparisons_from_blocking_rule( - "l.first_name = r.first_name", + res_dict = count_comparisons_from_blocking_rule( + table_or_tables=df_1, blocking_rule=block_on("first_name"), **args ) + + res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"] assert res == 1 - settings = {"link_type": "link_only"} - linker = Linker([df_1, df_2], settings, **helper.extra_linker_args()) - res = linker.count_num_comparisons_from_blocking_rule( - "1=1", + args["link_type"] = "link_only" + res_dict = 
count_comparisons_from_blocking_rule( + table_or_tables=[df_1, df_2], blocking_rule="1=1", **args ) + res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"] + assert res == 4 * 3 - res = linker.count_num_comparisons_from_blocking_rule( - "l.surname = r.surname", + res_dict = count_comparisons_from_blocking_rule( + table_or_tables=[df_1, df_2], blocking_rule=block_on("surname"), **args ) + res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"] assert res == 1 - res = linker.count_num_comparisons_from_blocking_rule( - "l.first_name = r.first_name", + res_dict = count_comparisons_from_blocking_rule( + table_or_tables=[df_1, df_2], blocking_rule=block_on("first_name"), **args ) + res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"] assert res == 3 - settings = {"link_type": "link_and_dedupe"} - - linker = Linker([df_1, df_2], settings, **helper.extra_linker_args()) + res_dict = count_comparisons_from_blocking_rule( + table_or_tables=[df_1, df_2, df_3], blocking_rule="1=1", **args + ) + res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"] + assert res == 4 * 3 + 4 * 2 + 2 * 3 - res = linker.count_num_comparisons_from_blocking_rule( - "1=1", + args["link_type"] = "link_and_dedupe" + res_dict = count_comparisons_from_blocking_rule( + table_or_tables=[df_1, df_2], blocking_rule="1=1", **args ) + res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"] expected = 4 * 3 + (4 * 3 / 2) + (3 * 2 / 2) assert res == expected rule = "l.first_name = r.first_name and l.surname = r.surname" - res = linker.count_num_comparisons_from_blocking_rule( - rule, + res_dict = count_comparisons_from_blocking_rule( + table_or_tables=[df_1, df_2], blocking_rule=rule, **args ) - + res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"] assert res == 1 rule = block_on("first_name", "surname") - res = linker.count_num_comparisons_from_blocking_rule( - rule, - ) - - -def validate_blocking_output(linker, expected_out, **kwargs): - records = cumulative_comparisons_generated_by_blocking_rules(linker, **kwargs) - - assert expected_out["row_count"] == list(map(lambda x: x["row_count"], records)) - - assert expected_out["cumulative_rows"] == list( - map(lambda x: x["cumulative_rows"], records) - ) - - assert expected_out["cartesian"] == records[0]["cartesian"] - - -@mark_with_dialects_excluding() -def test_blocking_records_accuracy(test_helpers, dialect): - from numpy import nan - - helper = test_helpers[dialect] - Linker = helper.Linker - - # resolve an issue w/ pyspark nulls - - df = [ - {"unique_id": 1, "first_name": "Tom", "surname": "Fox", "dob": "1980-01-01"}, - {"unique_id": 2, "first_name": "Amy", "surname": "Lee", "dob": "1980-01-01"}, - {"unique_id": 3, "first_name": "Tom", "surname": "Ray", "dob": "1980-03-22"}, - {"unique_id": 4, "first_name": "Kim", "surname": "Lee", "dob": None}, - ] - df = pd.DataFrame(df).fillna(nan).replace([nan], [None]) - - settings = { - "link_type": "dedupe_only", - "blocking_rules_to_generate_predictions": [ - "l.first_name = r.first_name", - ], - "comparisons": [], - "retain_matching_columns": True, - "retain_intermediate_calculation_columns": True, - "em_convergence": 0.001, - "max_iterations": 20, - } - - linker_settings = Linker(df, settings, **helper.extra_linker_args()) - n = len(df) - # dedupe only - validate_blocking_output( - linker_settings, - expected_out={ - "row_count": [1], - "cumulative_rows": [1], - "cartesian": n * (n - 1) / 2, - }, - 
blocking_rules=None, - ) - - # dedupe only with additional brs - blocking_rules = [ - "l.surname = r.surname", - "l.first_name = r.first_name", - ] - - validate_blocking_output( - linker_settings, - expected_out={ - "row_count": [1, 1], - "cumulative_rows": [1, 2], - "cartesian": n * (n - 1) / 2, - }, - blocking_rules=blocking_rules, - ) - - blocking_rules = [ - block_on("first_name").get_blocking_rule(dialect), - block_on("first_name", "surname").get_blocking_rule(dialect), - "l.dob = r.dob", - ] - - validate_blocking_output( - linker_settings, - expected_out={ - "row_count": [1, 0, 1], - "cumulative_rows": [1, 1, 2], - "cartesian": n * (n - 1) / 2, - }, - blocking_rules=blocking_rules, - ) - - # link and dedupe + link only - df_l = [ - {"unique_id": 1, "first_name": "Tom", "surname": "Fox", "dob": "1980-01-01"}, - {"unique_id": 2, "first_name": "Amy", "surname": "Lee", "dob": "1980-01-01"}, - ] - - df_l = pd.DataFrame(df_l) - - df_r = [ - {"unique_id": 1, "first_name": "Tom", "surname": "Ray", "dob": "1980-03-22"}, - {"unique_id": 2, "first_name": "Kim", "surname": "Lee", "dob": None}, - ] - - df_r = pd.DataFrame(df_r).fillna(nan).replace([nan], [None]) - - blocking_rules = [ - "l.surname = r.surname", # 2l:2r, - Or( - block_on("first_name"), CustomRule("substr(l.dob,1,4) = substr(r.dob,1,4)") - ).get_blocking_rule(dialect), # 1r:1r, 1l:2l, 1l:2r - "l.surname = r.surname", - ] - - settings = {"link_type": "link_and_dedupe"} - linker_settings = Linker([df_l, df_r], settings, **helper.extra_linker_args()) - validate_blocking_output( - linker_settings, - expected_out={ - "row_count": [1, 3, 0], - "cumulative_rows": [1, 4, 4], - "cartesian": 1 + 1 + 4, # within, within, between - }, - blocking_rules=blocking_rules, - ) - - blocking_rules = [ - "l.surname = r.surname", # 2l:2r, - Or( - block_on("first_name"), - CustomRule("substr(l.dob,1,4) = substr(r.dob,1,4)"), - ).get_blocking_rule(dialect), # 1l:1r, 1l:2r - "l.surname = r.surname", - ] - - settings = {"link_type": "link_only"} - linker_settings = Linker([df_l, df_r], settings, **helper.extra_linker_args()) - validate_blocking_output( - linker_settings, - expected_out={ - "row_count": [1, 2, 0], - "cumulative_rows": [1, 3, 3], - "cartesian": 4, - }, - blocking_rules=blocking_rules, + res_dict = count_comparisons_from_blocking_rule( + table_or_tables=[df_1, df_2], blocking_rule=rule, **args ) + res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"] + assert res == 1 - # link and dedupe - df_1 = [ - {"unique_id": 1, "first_name": "Tom", "surname": "Fox", "dob": "1980-01-01"}, - {"unique_id": 2, "first_name": "Amy", "surname": "Lee", "dob": "1980-01-01"}, - ] - - df_1 = pd.DataFrame(df_l) - - df_2 = [ - {"unique_id": 1, "first_name": "Tom", "surname": "Ray", "dob": "1980-03-22"}, - {"unique_id": 2, "first_name": "Kim", "surname": "Lee", "dob": None}, - ] - - df_2 = pd.DataFrame(df_2).fillna(nan).replace([nan], [None]) - - df_3 = [ - {"unique_id": 1, "first_name": "Tom", "surname": "Ray", "dob": "1980-03-22"}, - ] - - df_3 = pd.DataFrame(df_3) - - settings = {"link_type": "link_and_dedupe"} - blocking_rules = [ - "l.surname = r.surname", - "l.first_name = r.first_name", - ] - - linker_settings = Linker([df_1, df_2, df_3], settings, **helper.extra_linker_args()) - validate_blocking_output( - linker_settings, - expected_out={ - "row_count": [2, 2], - "cumulative_rows": [2, 4], - "cartesian": 5 * 4 / 2, - }, - blocking_rules=blocking_rules, - ) - - settings = {"link_type": "link_only"} - blocking_rules = [ - "l.surname = 
r.surname", - "l.first_name = r.first_name", - ] - - linker_settings = Linker([df_1, df_2, df_3], settings, **helper.extra_linker_args()) - validate_blocking_output( - linker_settings, - expected_out={ - "row_count": [2, 2], - "cumulative_rows": [2, 4], - "cartesian": 8, - }, - blocking_rules=blocking_rules, - ) - - blocking_rules_df = cumulative_comparisons_generated_by_blocking_rules( - linker_settings, blocking_rules=blocking_rules, return_dataframe=True - ) - - expected_row_count = pd.DataFrame({"row_count": [2, 2]}) - assert (blocking_rules_df["row_count"] == expected_row_count["row_count"]).all() - - -def test_analyse_blocking_fast_methodology(): - df_1 = pd.DataFrame( - [ - {"unique_id": 1, "first_name": "John", "surname": "Smith"}, - {"unique_id": 2, "first_name": "John", "surname": "Smith"}, - {"unique_id": 3, "first_name": "John", "surname": "Jones"}, - {"unique_id": 4, "first_name": "Mary", "surname": "Jones"}, - {"unique_id": 5, "first_name": "Brian", "surname": "Taylor"}, - ] - ) - - df_2 = pd.DataFrame( - [ - {"unique_id": 1, "first_name": "John", "surname": "Smith"}, - {"unique_id": 2, "first_name": "John", "surname": "Smith"}, - {"unique_id": 3, "first_name": "John", "surname": "Jones"}, - ] - ) - settings = {"link_type": "dedupe_only"} - db_api = DuckDBAPI() - - linker = Linker(df_1, settings, database_api=db_api) - - res = linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions( - "1=1", - ) - assert res == 5 * 5 - - settings = {"link_type": "dedupe_only"} - db_api = DuckDBAPI() - - linker = Linker(df_1, settings, database_api=db_api) - - res = linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions( - "l.first_name = r.first_name OR l.surname = r.surname", - ) - assert res == 5 * 5 - - res = linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions( - "l.first_name = r.first_name AND levenshtein(l.surname, r.surname) <2", - ) - assert res == 3 * 3 + 1 * 1 + 1 * 1 - - settings = {"link_type": "link_and_dedupe"} - db_api = DuckDBAPI() - - linker = Linker([df_1, df_2], settings, database_api=db_api) - - res = linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions( - "l.first_name = r.first_name" - ) - assert res == 6 * 6 + 1 * 1 + 1 * 1 - - settings = {"link_type": "link_only"} - db_api = DuckDBAPI() - - linker = Linker([df_1, df_2], settings, database_api=db_api) - res = linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions( - "l.first_name = r.first_name" - ) - assert res == 3 * 3 - - # Test a series of blocking rules with different edge cases. 
- # Assert that the naive methodology gives the same result as the new methodlogy - - df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv") - - blocking_rules = [ - "l.first_name = r.first_name", - "l.first_name = r.first_name AND l.surname = r.surname", - "substr(l.first_name,2,3) = substr(r.first_name,3,4)", - "substr(l.first_name,1,1) = substr(r.surname,1,1) and l.dob = r.dob", - "l.first_name = r.first_name and levenshtein(l.dob, r.dob) > -1", - "l.dob = r.dob and substr(l.first_name,2,3) = substr(r.first_name,3,4)", - ] - - sql_template = """ - select count(*) - from df as l - inner join df as r - on {blocking_rule} - """ - - results = {} - for br in blocking_rules: - sql = sql_template.format(blocking_rule=br) - res = duckdb.sql(sql).df() - results[br] = {"count_from_join_dedupe_only": res.iloc[0][0]} - - db_api = DuckDBAPI() - - linker = Linker(df, {"link_type": "dedupe_only"}, database_api=db_api) - for br in blocking_rules: - c = linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions(br) - results[br]["count_from_efficient_fn_dedupe_only"] = c - - for br in blocking_rules: - assert ( - results[br]["count_from_join_dedupe_only"] - == results[br]["count_from_efficient_fn_dedupe_only"] - ) - - # Link only - df_l = df.iloc[::2].copy() # even-indexed rows (starting from 0) - df_r = df.iloc[1::2].copy() # odd-indexed rows (starting from 1) - - sql_template = """ - select count(*) - from df_l as l - inner join df_r as r - on {blocking_rule} - """ - - results = {} - for br in blocking_rules: - sql = sql_template.format(blocking_rule=br) - res = duckdb.sql(sql).df() - results[br] = {"count_from_join_link_only": res.iloc[0][0]} - - db_api = DuckDBAPI() - - linker = Linker([df_l, df_r], {"link_type": "link_only"}, database_api=db_api) - for br in blocking_rules: - c = linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions(br) - results[br]["count_from_efficient_fn_link_only"] = c - - for br in blocking_rules: - assert ( - results[br]["count_from_join_link_only"] - == results[br]["count_from_efficient_fn_link_only"] - ) - - -def test_blocking_rule_accepts_different_dialects(): - br = "l.first_name = r.first_name" - br = BlockingRule(br, sqlglot_dialect="spark") - assert br._equi_join_conditions == [("first_name", "first_name")] - - br = "l.`hi THERE` = r.`hi THERE`" - br = BlockingRule(br, sqlglot_dialect="spark") - - assert br._equi_join_conditions == [("`hi THERE`", "`hi THERE`")] +# def validate_blocking_output(linker, expected_out, **kwargs): +# records = cumulative_comparisons_generated_by_blocking_rules(linker, **kwargs) + +# assert expected_out["row_count"] == list(map(lambda x: x["row_count"], records)) + +# assert expected_out["cumulative_rows"] == list( +# map(lambda x: x["cumulative_rows"], records) +# ) + +# assert expected_out["cartesian"] == records[0]["cartesian"] + + +# @mark_with_dialects_excluding() +# def test_blocking_records_accuracy(test_helpers, dialect): +# from numpy import nan + +# helper = test_helpers[dialect] +# Linker = helper.Linker + +# # resolve an issue w/ pyspark nulls + +# df = [ +# {"unique_id": 1, "first_name": "Tom", "surname": "Fox", "dob": "1980-01-01"}, +# {"unique_id": 2, "first_name": "Amy", "surname": "Lee", "dob": "1980-01-01"}, +# {"unique_id": 3, "first_name": "Tom", "surname": "Ray", "dob": "1980-03-22"}, +# {"unique_id": 4, "first_name": "Kim", "surname": "Lee", "dob": None}, +# ] +# df = pd.DataFrame(df).fillna(nan).replace([nan], [None]) + +# settings = { +# "link_type": "dedupe_only", +# 
"blocking_rules_to_generate_predictions": [ +# "l.first_name = r.first_name", +# ], +# "comparisons": [], +# "retain_matching_columns": True, +# "retain_intermediate_calculation_columns": True, +# "em_convergence": 0.001, +# "max_iterations": 20, +# } + +# linker_settings = Linker(df, settings, **helper.extra_linker_args()) +# n = len(df) +# # dedupe only +# validate_blocking_output( +# linker_settings, +# expected_out={ +# "row_count": [1], +# "cumulative_rows": [1], +# "cartesian": n * (n - 1) / 2, +# }, +# blocking_rules=None, +# ) + +# # dedupe only with additional brs +# blocking_rules = [ +# "l.surname = r.surname", +# "l.first_name = r.first_name", +# ] + +# validate_blocking_output( +# linker_settings, +# expected_out={ +# "row_count": [1, 1], +# "cumulative_rows": [1, 2], +# "cartesian": n * (n - 1) / 2, +# }, +# blocking_rules=blocking_rules, +# ) + +# blocking_rules = [ +# block_on("first_name").get_blocking_rule(dialect), +# block_on("first_name", "surname").get_blocking_rule(dialect), +# "l.dob = r.dob", +# ] + +# validate_blocking_output( +# linker_settings, +# expected_out={ +# "row_count": [1, 0, 1], +# "cumulative_rows": [1, 1, 2], +# "cartesian": n * (n - 1) / 2, +# }, +# blocking_rules=blocking_rules, +# ) + +# # link and dedupe + link only +# df_l = [ +# {"unique_id": 1, "first_name": "Tom", "surname": "Fox", "dob": "1980-01-01"}, +# {"unique_id": 2, "first_name": "Amy", "surname": "Lee", "dob": "1980-01-01"}, +# ] + +# df_l = pd.DataFrame(df_l) + +# df_r = [ +# {"unique_id": 1, "first_name": "Tom", "surname": "Ray", "dob": "1980-03-22"}, +# {"unique_id": 2, "first_name": "Kim", "surname": "Lee", "dob": None}, +# ] + +# df_r = pd.DataFrame(df_r).fillna(nan).replace([nan], [None]) + +# blocking_rules = [ +# "l.surname = r.surname", # 2l:2r, +# Or( +# block_on("first_name"), CustomRule("substr(l.dob,1,4) = substr(r.dob,1,4)") +# ).get_blocking_rule(dialect), # 1r:1r, 1l:2l, 1l:2r +# "l.surname = r.surname", +# ] + +# settings = {"link_type": "link_and_dedupe"} +# linker_settings = Linker([df_l, df_r], settings, **helper.extra_linker_args()) +# validate_blocking_output( +# linker_settings, +# expected_out={ +# "row_count": [1, 3, 0], +# "cumulative_rows": [1, 4, 4], +# "cartesian": 1 + 1 + 4, # within, within, between +# }, +# blocking_rules=blocking_rules, +# ) + +# blocking_rules = [ +# "l.surname = r.surname", # 2l:2r, +# Or( +# block_on("first_name"), +# CustomRule("substr(l.dob,1,4) = substr(r.dob,1,4)"), +# ).get_blocking_rule(dialect), # 1l:1r, 1l:2r +# "l.surname = r.surname", +# ] + +# settings = {"link_type": "link_only"} +# linker_settings = Linker([df_l, df_r], settings, **helper.extra_linker_args()) +# validate_blocking_output( +# linker_settings, +# expected_out={ +# "row_count": [1, 2, 0], +# "cumulative_rows": [1, 3, 3], +# "cartesian": 4, +# }, +# blocking_rules=blocking_rules, +# ) + +# # link and dedupe +# df_1 = [ +# {"unique_id": 1, "first_name": "Tom", "surname": "Fox", "dob": "1980-01-01"}, +# {"unique_id": 2, "first_name": "Amy", "surname": "Lee", "dob": "1980-01-01"}, +# ] + +# df_1 = pd.DataFrame(df_l) + +# df_2 = [ +# {"unique_id": 1, "first_name": "Tom", "surname": "Ray", "dob": "1980-03-22"}, +# {"unique_id": 2, "first_name": "Kim", "surname": "Lee", "dob": None}, +# ] + +# df_2 = pd.DataFrame(df_2).fillna(nan).replace([nan], [None]) + +# df_3 = [ +# {"unique_id": 1, "first_name": "Tom", "surname": "Ray", "dob": "1980-03-22"}, +# ] + +# df_3 = pd.DataFrame(df_3) + +# settings = {"link_type": "link_and_dedupe"} +# blocking_rules = [ +# 
"l.surname = r.surname", +# "l.first_name = r.first_name", +# ] + +# linker_settings = Linker([df_1, df_2, df_3], settings, **helper.extra_linker_args()) +# validate_blocking_output( +# linker_settings, +# expected_out={ +# "row_count": [2, 2], +# "cumulative_rows": [2, 4], +# "cartesian": 5 * 4 / 2, +# }, +# blocking_rules=blocking_rules, +# ) + +# settings = {"link_type": "link_only"} +# blocking_rules = [ +# "l.surname = r.surname", +# "l.first_name = r.first_name", +# ] + +# linker_settings = Linker([df_1, df_2, df_3], settings, **helper.extra_linker_args()) +# validate_blocking_output( +# linker_settings, +# expected_out={ +# "row_count": [2, 2], +# "cumulative_rows": [2, 4], +# "cartesian": 8, +# }, +# blocking_rules=blocking_rules, +# ) + +# blocking_rules_df = cumulative_comparisons_generated_by_blocking_rules( +# linker_settings, blocking_rules=blocking_rules, return_dataframe=True +# ) + +# expected_row_count = pd.DataFrame({"row_count": [2, 2]}) +# assert (blocking_rules_df["row_count"] == expected_row_count["row_count"]).all() + + +# def test_analyse_blocking_fast_methodology(): +# df_1 = pd.DataFrame( +# [ +# {"unique_id": 1, "first_name": "John", "surname": "Smith"}, +# {"unique_id": 2, "first_name": "John", "surname": "Smith"}, +# {"unique_id": 3, "first_name": "John", "surname": "Jones"}, +# {"unique_id": 4, "first_name": "Mary", "surname": "Jones"}, +# {"unique_id": 5, "first_name": "Brian", "surname": "Taylor"}, +# ] +# ) + +# df_2 = pd.DataFrame( +# [ +# {"unique_id": 1, "first_name": "John", "surname": "Smith"}, +# {"unique_id": 2, "first_name": "John", "surname": "Smith"}, +# {"unique_id": 3, "first_name": "John", "surname": "Jones"}, +# ] +# ) +# settings = {"link_type": "dedupe_only"} +# db_api = DuckDBAPI() + +# linker = Linker(df_1, settings, database_api=db_api) + +# res = linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions( +# "1=1", +# ) +# assert res == 5 * 5 + +# settings = {"link_type": "dedupe_only"} +# db_api = DuckDBAPI() + +# linker = Linker(df_1, settings, database_api=db_api) + +# res = linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions( +# "l.first_name = r.first_name OR l.surname = r.surname", +# ) +# assert res == 5 * 5 + +# res = linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions( +# "l.first_name = r.first_name AND levenshtein(l.surname, r.surname) <2", +# ) +# assert res == 3 * 3 + 1 * 1 + 1 * 1 + +# settings = {"link_type": "link_and_dedupe"} +# db_api = DuckDBAPI() + +# linker = Linker([df_1, df_2], settings, database_api=db_api) + +# res = linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions( +# "l.first_name = r.first_name" +# ) +# assert res == 6 * 6 + 1 * 1 + 1 * 1 + +# settings = {"link_type": "link_only"} +# db_api = DuckDBAPI() + +# linker = Linker([df_1, df_2], settings, database_api=db_api) + +# res = linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions( +# "l.first_name = r.first_name" +# ) +# assert res == 3 * 3 + +# # Test a series of blocking rules with different edge cases. 
+# # Assert that the naive methodology gives the same result as the new methodlogy + +# df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv") + +# blocking_rules = [ +# "l.first_name = r.first_name", +# "l.first_name = r.first_name AND l.surname = r.surname", +# "substr(l.first_name,2,3) = substr(r.first_name,3,4)", +# "substr(l.first_name,1,1) = substr(r.surname,1,1) and l.dob = r.dob", +# "l.first_name = r.first_name and levenshtein(l.dob, r.dob) > -1", +# "l.dob = r.dob and substr(l.first_name,2,3) = substr(r.first_name,3,4)", +# ] + +# sql_template = """ +# select count(*) +# from df as l +# inner join df as r +# on {blocking_rule} +# """ + +# results = {} +# for br in blocking_rules: +# sql = sql_template.format(blocking_rule=br) +# res = duckdb.sql(sql).df() +# results[br] = {"count_from_join_dedupe_only": res.iloc[0][0]} + +# db_api = DuckDBAPI() + +# linker = Linker(df, {"link_type": "dedupe_only"}, database_api=db_api) +# for br in blocking_rules: +# c = linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions(br) +# results[br]["count_from_efficient_fn_dedupe_only"] = c + +# for br in blocking_rules: +# assert ( +# results[br]["count_from_join_dedupe_only"] +# == results[br]["count_from_efficient_fn_dedupe_only"] +# ) + +# # Link only +# df_l = df.iloc[::2].copy() # even-indexed rows (starting from 0) +# df_r = df.iloc[1::2].copy() # odd-indexed rows (starting from 1) + +# sql_template = """ +# select count(*) +# from df_l as l +# inner join df_r as r +# on {blocking_rule} +# """ + +# results = {} +# for br in blocking_rules: +# sql = sql_template.format(blocking_rule=br) +# res = duckdb.sql(sql).df() +# results[br] = {"count_from_join_link_only": res.iloc[0][0]} +# db_api = DuckDBAPI() -@mark_with_dialects_excluding() -def test_cumulative_br_funs(test_helpers, dialect): - helper = test_helpers[dialect] - Linker = helper.Linker +# linker = Linker([df_l, df_r], {"link_type": "link_only"}, database_api=db_api) +# for br in blocking_rules: +# c = linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions(br) +# results[br]["count_from_efficient_fn_link_only"] = c - df = helper.load_frame_from_csv("./tests/datasets/fake_1000_from_splink_demos.csv") +# for br in blocking_rules: +# assert ( +# results[br]["count_from_join_link_only"] +# == results[br]["count_from_efficient_fn_link_only"] +# ) - linker = Linker(df, get_settings_dict(), **helper.extra_linker_args()) - linker.cumulative_comparisons_from_blocking_rules_records() - linker.cumulative_comparisons_from_blocking_rules_records( - ["l.first_name = r.first_name", block_on("surname")] - ) - linker.cumulative_num_comparisons_from_blocking_rules_chart( - ["l.first_name = r.first_name", block_on("surname")] - ) +# def test_blocking_rule_accepts_different_dialects(): +# br = "l.first_name = r.first_name" +# br = BlockingRule(br, sqlglot_dialect="spark") +# assert br._equi_join_conditions == [("first_name", "first_name")] + +# br = "l.`hi THERE` = r.`hi THERE`" +# br = BlockingRule(br, sqlglot_dialect="spark") + +# assert br._equi_join_conditions == [("`hi THERE`", "`hi THERE`")] + - assert linker.count_num_comparisons_from_blocking_rule(block_on("surname")) == 3167 +# @mark_with_dialects_excluding() +# def test_cumulative_br_funs(test_helpers, dialect): +# helper = test_helpers[dialect] +# Linker = helper.Linker + +# df = helper.load_frame_from_csv("./tests/datasets/fake_1000_from_splink_demos.csv") + +# linker = Linker(df, get_settings_dict(), **helper.extra_linker_args()) +# 
linker.cumulative_comparisons_from_blocking_rules_records() +# linker.cumulative_comparisons_from_blocking_rules_records( +# ["l.first_name = r.first_name", block_on("surname")] +# ) + +# linker.cumulative_num_comparisons_from_blocking_rules_chart( +# ["l.first_name = r.first_name", block_on("surname")] +# ) + +# assert linker.count_num_comparisons_from_blocking_rule(block_on("surname")) == 3167 From 5a35dc39ecaad603b7940c32d9ab52db4365101c Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 08:11:13 +0100 Subject: [PATCH 17/59] br tests --- splink/analyse_blocking.py | 41 +++++---- tests/test_analyse_blocking.py | 146 ++++++++++++++++----------------- 2 files changed, 93 insertions(+), 94 deletions(-) diff --git a/splink/analyse_blocking.py b/splink/analyse_blocking.py index 999660f2ed..bfd3cec011 100644 --- a/splink/analyse_blocking.py +++ b/splink/analyse_blocking.py @@ -341,27 +341,26 @@ def _cumulative_comparisons_to_be_scored_from_blocking_rules( """ pipeline.enqueue_sql(sql, "__splink__df_count_cumulative_blocks") - sql = f""" - SELECT - row_count, - match_key, - cast(SUM(row_count) OVER (ORDER BY match_key) as int) AS cumulative_rows, - cast(SUM(row_count) OVER (ORDER BY match_key) - row_count as int) AS start, - cast({cartesian_count} as int) as cartesian - - FROM - __splink__df_count_cumulative_blocks - """ + result_df = db_api.sql_pipeline_to_splink_dataframe(pipeline).as_pandas_dataframe() - pipeline.enqueue_sql(sql, "__splink__df_count_cumulative_blocks_2") + # The above table won't include rules that have no matches + all_rules_df = pd.DataFrame( + { + "match_key": [str(i) for i in range(len(blocking_rules))], + "blocking_rule": [br.blocking_rule_sql for br in blocking_rules], + } + ) - records = db_api.sql_pipeline_to_splink_dataframe(pipeline).as_record_dict() + complete_df = all_rules_df.merge(result_df, on="match_key", how="left").fillna( + {"row_count": 0} + ) - # Lookup table match_key -> blocking_rule - rules = {i: r.blocking_rule_sql for i, r in enumerate(blocking_rules)} + complete_df["cumulative_rows"] = complete_df["row_count"].cumsum().astype(int) + complete_df["start"] = complete_df["cumulative_rows"] - complete_df["row_count"] + complete_df["cartesian"] = cartesian_count - for r in records: - r["blocking_rule"] = rules[int(r["match_key"])] + for c in ["row_count", "cumulative_rows", "cartesian", "start"]: + complete_df[c] = complete_df[c].astype(int) [b.drop_materialised_id_pairs_dataframe() for b in exploding_br_with_id_tables] @@ -373,8 +372,8 @@ def _cumulative_comparisons_to_be_scored_from_blocking_rules( "match_key", "start", ] - if len(records) > 0: - return pd.DataFrame(records)[col_order] + if len(complete_df) > 0: + return complete_df[col_order] else: return pd.DataFrame( [ @@ -504,8 +503,8 @@ def cumulative_comparisons_to_be_scored_from_blocking_rules_data( blocking_rule_creators: Iterable[Union[BlockingRuleCreator, str, dict]], link_type: link_type_type, db_api: DatabaseAPI, - max_rows_limit: int = 1e9, unique_id_column_name: str, + max_rows_limit: int = 1e9, source_dataset_column_name: str = None, ): splink_df_dict = db_api.register_multiple_tables(table_or_tables) @@ -536,8 +535,8 @@ def cumulative_comparisons_to_be_scored_from_blocking_rules_chart( blocking_rule_creators: Iterable[Union[BlockingRuleCreator, str, dict]], link_type: link_type_type, db_api: DatabaseAPI, - max_rows_limit: int = 1e9, unique_id_column_name: str, + max_rows_limit: int = 1e9, source_dataset_column_name: str = None, ): splink_df_dict = 
db_api.register_multiple_tables(table_or_tables) diff --git a/tests/test_analyse_blocking.py b/tests/test_analyse_blocking.py index acfca9d78c..70af87672b 100644 --- a/tests/test_analyse_blocking.py +++ b/tests/test_analyse_blocking.py @@ -3,6 +3,7 @@ from splink.analyse_blocking import ( count_comparisons_from_blocking_rule, + cumulative_comparisons_to_be_scored_from_blocking_rules_data, ) from splink.blocking import BlockingRule from splink.blocking_rule_library import CustomRule, Or, block_on @@ -110,91 +111,90 @@ def test_analyse_blocking_slow_methodology(test_helpers, dialect): assert res == 1 -# def validate_blocking_output(linker, expected_out, **kwargs): -# records = cumulative_comparisons_generated_by_blocking_rules(linker, **kwargs) +def validate_blocking_output(comparison_count_args, expected_out): + records = cumulative_comparisons_to_be_scored_from_blocking_rules_data( + **comparison_count_args + ).to_dict(orient="records") -# assert expected_out["row_count"] == list(map(lambda x: x["row_count"], records)) + assert expected_out["row_count"] == list(map(lambda x: x["row_count"], records)) -# assert expected_out["cumulative_rows"] == list( -# map(lambda x: x["cumulative_rows"], records) -# ) + assert expected_out["cumulative_rows"] == list( + map(lambda x: x["cumulative_rows"], records) + ) -# assert expected_out["cartesian"] == records[0]["cartesian"] + assert expected_out["cartesian"] == records[0]["cartesian"] -# @mark_with_dialects_excluding() -# def test_blocking_records_accuracy(test_helpers, dialect): -# from numpy import nan +@mark_with_dialects_excluding() +def test_blocking_records_accuracy(test_helpers, dialect): + from numpy import nan -# helper = test_helpers[dialect] -# Linker = helper.Linker + helper = test_helpers[dialect] + db_api = helper.DatabaseAPI(**helper.db_api_args()) -# # resolve an issue w/ pyspark nulls + # resolve an issue w/ pyspark nulls -# df = [ -# {"unique_id": 1, "first_name": "Tom", "surname": "Fox", "dob": "1980-01-01"}, -# {"unique_id": 2, "first_name": "Amy", "surname": "Lee", "dob": "1980-01-01"}, -# {"unique_id": 3, "first_name": "Tom", "surname": "Ray", "dob": "1980-03-22"}, -# {"unique_id": 4, "first_name": "Kim", "surname": "Lee", "dob": None}, -# ] -# df = pd.DataFrame(df).fillna(nan).replace([nan], [None]) - -# settings = { -# "link_type": "dedupe_only", -# "blocking_rules_to_generate_predictions": [ -# "l.first_name = r.first_name", -# ], -# "comparisons": [], -# "retain_matching_columns": True, -# "retain_intermediate_calculation_columns": True, -# "em_convergence": 0.001, -# "max_iterations": 20, -# } - -# linker_settings = Linker(df, settings, **helper.extra_linker_args()) -# n = len(df) -# # dedupe only -# validate_blocking_output( -# linker_settings, -# expected_out={ -# "row_count": [1], -# "cumulative_rows": [1], -# "cartesian": n * (n - 1) / 2, -# }, -# blocking_rules=None, -# ) + df = [ + {"unique_id": 1, "first_name": "Tom", "surname": "Fox", "dob": "1980-01-01"}, + {"unique_id": 2, "first_name": "Amy", "surname": "Lee", "dob": "1980-01-01"}, + {"unique_id": 3, "first_name": "Tom", "surname": "Ray", "dob": "1980-03-22"}, + {"unique_id": 4, "first_name": "Kim", "surname": "Lee", "dob": None}, + ] + df = pd.DataFrame(df).fillna(nan).replace([nan], [None]) -# # dedupe only with additional brs -# blocking_rules = [ -# "l.surname = r.surname", -# "l.first_name = r.first_name", -# ] + comparison_count_args = { + "table_or_tables": df, + "blocking_rule_creators": [block_on("first_name")], + "link_type": "dedupe_only", + "db_api": 
db_api, + "unique_id_column_name": "unique_id", + } -# validate_blocking_output( -# linker_settings, -# expected_out={ -# "row_count": [1, 1], -# "cumulative_rows": [1, 2], -# "cartesian": n * (n - 1) / 2, -# }, -# blocking_rules=blocking_rules, -# ) + n = len(df) + # dedupe only + validate_blocking_output( + comparison_count_args, + expected_out={ + "row_count": [1], + "cumulative_rows": [1], + "cartesian": n * (n - 1) / 2, + }, + ) -# blocking_rules = [ -# block_on("first_name").get_blocking_rule(dialect), -# block_on("first_name", "surname").get_blocking_rule(dialect), -# "l.dob = r.dob", -# ] + # dedupe only with additional brs + blocking_rules = [ + "l.surname = r.surname", + "l.first_name = r.first_name", + ] + + comparison_count_args["blocking_rule_creators"] = blocking_rules + + validate_blocking_output( + comparison_count_args, + expected_out={ + "row_count": [1, 1], + "cumulative_rows": [1, 2], + "cartesian": n * (n - 1) / 2, + }, + ) + + blocking_rules = [ + block_on("first_name"), + block_on("first_name", "surname"), + "l.dob = r.dob", + ] + + comparison_count_args["blocking_rule_creators"] = blocking_rules + + validate_blocking_output( + comparison_count_args, + expected_out={ + "row_count": [1, 0, 1], + "cumulative_rows": [1, 1, 2], + "cartesian": n * (n - 1) / 2, + }, + ) -# validate_blocking_output( -# linker_settings, -# expected_out={ -# "row_count": [1, 0, 1], -# "cumulative_rows": [1, 1, 2], -# "cartesian": n * (n - 1) / 2, -# }, -# blocking_rules=blocking_rules, -# ) # # link and dedupe + link only # df_l = [ From 701390a9d49d169a8cf760a818f4ff27e36ee8df Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 08:36:05 +0100 Subject: [PATCH 18/59] fix test total comparison count --- splink/analyse_blocking.py | 2 +- tests/test_total_comparison_count.py | 45 +++++++++++----------------- 2 files changed, 19 insertions(+), 28 deletions(-) diff --git a/splink/analyse_blocking.py b/splink/analyse_blocking.py index bfd3cec011..f319f3aa49 100644 --- a/splink/analyse_blocking.py +++ b/splink/analyse_blocking.py @@ -505,7 +505,7 @@ def cumulative_comparisons_to_be_scored_from_blocking_rules_data( db_api: DatabaseAPI, unique_id_column_name: str, max_rows_limit: int = 1e9, - source_dataset_column_name: str = None, + source_dataset_column_name: str = "source_dataset", ): splink_df_dict = db_api.register_multiple_tables(table_or_tables) diff --git a/tests/test_total_comparison_count.py b/tests/test_total_comparison_count.py index ba9678f73e..83381e5fe5 100644 --- a/tests/test_total_comparison_count.py +++ b/tests/test_total_comparison_count.py @@ -1,11 +1,8 @@ import pandas as pd import pytest -from splink.analyse_blocking import ( - number_of_comparisons_generated_by_blocking_rule_post_filters_sql, -) +from splink.analyse_blocking import count_comparisons_from_blocking_rule from splink.duckdb.database_api import DuckDBAPI -from splink.linker import Linker from splink.misc import calculate_cartesian from splink.pipeline import CTEPipeline from splink.vertically_concatenate import vertically_concatenate_sql @@ -85,48 +82,42 @@ def make_dummy_frame(row_count): ) dfs = list(map(make_dummy_frame, frame_sizes)) - settings = {"link_type": link_type} db_api = DuckDBAPI() - linker = Linker(dfs, settings, database_api=db_api) - pipeline = CTEPipeline() + res_dict = count_comparisons_from_blocking_rule( + table_or_tables=dfs, + blocking_rule="1=1", + link_type=link_type, + db_api=db_api, + unique_id_column_name="unique_id", + ) - sds_name = 
linker._settings_obj.column_info_settings.source_dataset_column_name + res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"] + # compare with count from each frame + pipeline = CTEPipeline() sql = vertically_concatenate_sql( - input_tables=linker._input_tables_dict, - salting_required=linker._settings_obj.salting_required, - source_dataset_column_name=sds_name, + input_tables=db_api.register_multiple_tables(dfs), + salting_required=False, + source_dataset_column_name="source_dataset", ) - pipeline.enqueue_sql(sql, "__splink__df_concat") - df_concat = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline) - - pipeline = CTEPipeline([df_concat]) - # calculate full number of comparisons - full_count_sql = number_of_comparisons_generated_by_blocking_rule_post_filters_sql( - linker, "1=1" - ) - pipeline.enqueue_sql(full_count_sql, "__splink__analyse_blocking_rule") - res_df = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline) - res = res_df.as_record_dict()[0] - # compare with count from each frame sql = f""" select count(*) as count from __splink__df_concat {group_by} order by count desc """ - pipeline = CTEPipeline([df_concat]) + pipeline.enqueue_sql(sql, "__splink__cartesian_product") - cartesian_count = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline) + cartesian_count = db_api.sql_pipeline_to_splink_dataframe(pipeline) row_count_df = cartesian_count.as_record_dict() cartesian_count.drop_table_from_database_and_remove_from_cache() # check this is what we expect from input assert frame_sizes == [frame["count"] for frame in row_count_df] - computed_value_count = calculate_cartesian(row_count_df, link_type) + computed_value_count = int(calculate_cartesian(row_count_df, link_type)) - assert computed_value_count == res["count_of_pairwise_comparisons_generated"] + assert computed_value_count == res From c6ae3ce95fd0040cd99bbe085dc113f9fbf2e34c Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 08:37:22 +0100 Subject: [PATCH 19/59] more fixes to test analyse blocking --- tests/test_analyse_blocking.py | 196 ++++++++++++++++----------------- 1 file changed, 97 insertions(+), 99 deletions(-) diff --git a/tests/test_analyse_blocking.py b/tests/test_analyse_blocking.py index 70af87672b..7cb9c767d4 100644 --- a/tests/test_analyse_blocking.py +++ b/tests/test_analyse_blocking.py @@ -195,124 +195,122 @@ def test_blocking_records_accuracy(test_helpers, dialect): }, ) + # link and dedupe + link only + df_l = [ + {"unique_id": 1, "first_name": "Tom", "surname": "Fox", "dob": "1980-01-01"}, + {"unique_id": 2, "first_name": "Amy", "surname": "Lee", "dob": "1980-01-01"}, + ] -# # link and dedupe + link only -# df_l = [ -# {"unique_id": 1, "first_name": "Tom", "surname": "Fox", "dob": "1980-01-01"}, -# {"unique_id": 2, "first_name": "Amy", "surname": "Lee", "dob": "1980-01-01"}, -# ] - -# df_l = pd.DataFrame(df_l) + df_l = pd.DataFrame(df_l) -# df_r = [ -# {"unique_id": 1, "first_name": "Tom", "surname": "Ray", "dob": "1980-03-22"}, -# {"unique_id": 2, "first_name": "Kim", "surname": "Lee", "dob": None}, -# ] + df_r = [ + {"unique_id": 1, "first_name": "Tom", "surname": "Ray", "dob": "1980-03-22"}, + {"unique_id": 2, "first_name": "Kim", "surname": "Lee", "dob": None}, + ] -# df_r = pd.DataFrame(df_r).fillna(nan).replace([nan], [None]) + df_r = pd.DataFrame(df_r).fillna(nan).replace([nan], [None]) -# blocking_rules = [ -# "l.surname = r.surname", # 2l:2r, -# Or( -# block_on("first_name"), CustomRule("substr(l.dob,1,4) = substr(r.dob,1,4)") -# 
).get_blocking_rule(dialect), # 1r:1r, 1l:2l, 1l:2r -# "l.surname = r.surname", -# ] + blocking_rules = [ + "l.surname = r.surname", # 2l:2r, + Or( + block_on("first_name"), CustomRule("substr(l.dob,1,4) = substr(r.dob,1,4)") + ), # 1r:1r, 1l:2l, 1l:2r + "l.surname = r.surname", + ] -# settings = {"link_type": "link_and_dedupe"} -# linker_settings = Linker([df_l, df_r], settings, **helper.extra_linker_args()) -# validate_blocking_output( -# linker_settings, -# expected_out={ -# "row_count": [1, 3, 0], -# "cumulative_rows": [1, 4, 4], -# "cartesian": 1 + 1 + 4, # within, within, between -# }, -# blocking_rules=blocking_rules, -# ) + comparison_count_args = { + "table_or_tables": [df_l, df_r], + "link_type": "link_and_dedupe", + "db_api": db_api, + "unique_id_column_name": "unique_id", + "blocking_rule_creators": blocking_rules, + } -# blocking_rules = [ -# "l.surname = r.surname", # 2l:2r, -# Or( -# block_on("first_name"), -# CustomRule("substr(l.dob,1,4) = substr(r.dob,1,4)"), -# ).get_blocking_rule(dialect), # 1l:1r, 1l:2r -# "l.surname = r.surname", -# ] + validate_blocking_output( + comparison_count_args, + expected_out={ + "row_count": [1, 3, 0], + "cumulative_rows": [1, 4, 4], + "cartesian": 1 + 1 + 4, # within, within, between + }, + ) -# settings = {"link_type": "link_only"} -# linker_settings = Linker([df_l, df_r], settings, **helper.extra_linker_args()) -# validate_blocking_output( -# linker_settings, -# expected_out={ -# "row_count": [1, 2, 0], -# "cumulative_rows": [1, 3, 3], -# "cartesian": 4, -# }, -# blocking_rules=blocking_rules, -# ) + blocking_rules = [ + "l.surname = r.surname", # 2l:2r, + Or( + block_on("first_name"), + CustomRule("substr(l.dob,1,4) = substr(r.dob,1,4)"), + ), # 1l:1r, 1l:2r + "l.surname = r.surname", + ] -# # link and dedupe -# df_1 = [ -# {"unique_id": 1, "first_name": "Tom", "surname": "Fox", "dob": "1980-01-01"}, -# {"unique_id": 2, "first_name": "Amy", "surname": "Lee", "dob": "1980-01-01"}, -# ] + comparison_count_args["link_type"] = "link_only" + comparison_count_args["blocking_rule_creators"] = blocking_rules -# df_1 = pd.DataFrame(df_l) + validate_blocking_output( + comparison_count_args, + expected_out={ + "row_count": [1, 2, 0], + "cumulative_rows": [1, 3, 3], + "cartesian": 4, + }, + ) -# df_2 = [ -# {"unique_id": 1, "first_name": "Tom", "surname": "Ray", "dob": "1980-03-22"}, -# {"unique_id": 2, "first_name": "Kim", "surname": "Lee", "dob": None}, -# ] + # link and dedupe + df_1 = [ + {"unique_id": 1, "first_name": "Tom", "surname": "Fox", "dob": "1980-01-01"}, + {"unique_id": 2, "first_name": "Amy", "surname": "Lee", "dob": "1980-01-01"}, + ] -# df_2 = pd.DataFrame(df_2).fillna(nan).replace([nan], [None]) + df_1 = pd.DataFrame(df_l) -# df_3 = [ -# {"unique_id": 1, "first_name": "Tom", "surname": "Ray", "dob": "1980-03-22"}, -# ] + df_2 = [ + {"unique_id": 1, "first_name": "Tom", "surname": "Ray", "dob": "1980-03-22"}, + {"unique_id": 2, "first_name": "Kim", "surname": "Lee", "dob": None}, + ] -# df_3 = pd.DataFrame(df_3) + df_2 = pd.DataFrame(df_2).fillna(nan).replace([nan], [None]) -# settings = {"link_type": "link_and_dedupe"} -# blocking_rules = [ -# "l.surname = r.surname", -# "l.first_name = r.first_name", -# ] + df_3 = [ + {"unique_id": 1, "first_name": "Tom", "surname": "Ray", "dob": "1980-03-22"}, + ] -# linker_settings = Linker([df_1, df_2, df_3], settings, **helper.extra_linker_args()) -# validate_blocking_output( -# linker_settings, -# expected_out={ -# "row_count": [2, 2], -# "cumulative_rows": [2, 4], -# "cartesian": 5 * 4 / 2, 
-# }, -# blocking_rules=blocking_rules, -# ) + df_3 = pd.DataFrame(df_3) -# settings = {"link_type": "link_only"} -# blocking_rules = [ -# "l.surname = r.surname", -# "l.first_name = r.first_name", -# ] + comparison_count_args = { + "table_or_tables": [df_1, df_2, df_3], + "link_type": "link_and_dedupe", + "db_api": db_api, + "unique_id_column_name": "unique_id", + "blocking_rule_creators": [ + block_on("surname"), + block_on("first_name"), + ], + } -# linker_settings = Linker([df_1, df_2, df_3], settings, **helper.extra_linker_args()) -# validate_blocking_output( -# linker_settings, -# expected_out={ -# "row_count": [2, 2], -# "cumulative_rows": [2, 4], -# "cartesian": 8, -# }, -# blocking_rules=blocking_rules, -# ) + validate_blocking_output( + comparison_count_args, + expected_out={ + "row_count": [2, 2], + "cumulative_rows": [2, 4], + "cartesian": 5 * 4 / 2, + }, + ) -# blocking_rules_df = cumulative_comparisons_generated_by_blocking_rules( -# linker_settings, blocking_rules=blocking_rules, return_dataframe=True -# ) + comparison_count_args["link_type"] = "link_only" + comparison_count_args["blocking_rule_creators"] = [ + block_on("surname"), + block_on("first_name"), + ] -# expected_row_count = pd.DataFrame({"row_count": [2, 2]}) -# assert (blocking_rules_df["row_count"] == expected_row_count["row_count"]).all() + validate_blocking_output( + comparison_count_args, + expected_out={ + "row_count": [2, 2], + "cumulative_rows": [2, 4], + "cartesian": 8, + }, + ) # def test_analyse_blocking_fast_methodology(): From 55a1db45120b9c3a5d2ce674adac0843bf51fbcc Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 08:58:40 +0100 Subject: [PATCH 20/59] test edge cases --- tests/test_analyse_blocking.py | 282 ++++++++++++++++++--------------- 1 file changed, 152 insertions(+), 130 deletions(-) diff --git a/tests/test_analyse_blocking.py b/tests/test_analyse_blocking.py index 7cb9c767d4..bdb3553f32 100644 --- a/tests/test_analyse_blocking.py +++ b/tests/test_analyse_blocking.py @@ -313,149 +313,171 @@ def test_blocking_records_accuracy(test_helpers, dialect): ) -# def test_analyse_blocking_fast_methodology(): -# df_1 = pd.DataFrame( -# [ -# {"unique_id": 1, "first_name": "John", "surname": "Smith"}, -# {"unique_id": 2, "first_name": "John", "surname": "Smith"}, -# {"unique_id": 3, "first_name": "John", "surname": "Jones"}, -# {"unique_id": 4, "first_name": "Mary", "surname": "Jones"}, -# {"unique_id": 5, "first_name": "Brian", "surname": "Taylor"}, -# ] -# ) +def test_analyse_blocking_fast_methodology(): + df_1 = pd.DataFrame( + [ + {"unique_id": 1, "first_name": "John", "surname": "Smith"}, + {"unique_id": 2, "first_name": "John", "surname": "Smith"}, + {"unique_id": 3, "first_name": "John", "surname": "Jones"}, + {"unique_id": 4, "first_name": "Mary", "surname": "Jones"}, + {"unique_id": 5, "first_name": "Brian", "surname": "Taylor"}, + ] + ) -# df_2 = pd.DataFrame( -# [ -# {"unique_id": 1, "first_name": "John", "surname": "Smith"}, -# {"unique_id": 2, "first_name": "John", "surname": "Smith"}, -# {"unique_id": 3, "first_name": "John", "surname": "Jones"}, -# ] -# ) -# settings = {"link_type": "dedupe_only"} -# db_api = DuckDBAPI() + df_2 = pd.DataFrame( + [ + {"unique_id": 1, "first_name": "John", "surname": "Smith"}, + {"unique_id": 2, "first_name": "John", "surname": "Smith"}, + {"unique_id": 3, "first_name": "John", "surname": "Jones"}, + ] + ) -# linker = Linker(df_1, settings, database_api=db_api) + db_api = DuckDBAPI() -# res = 
linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions( -# "1=1", -# ) -# assert res == 5 * 5 + args = { + "table_or_tables": df_1, + "link_type": "dedupe_only", + "db_api": db_api, + "unique_id_column_name": "unique_id", + "compute_post_filter_count": False, + } -# settings = {"link_type": "dedupe_only"} -# db_api = DuckDBAPI() + args["blocking_rule"] = "1=1" -# linker = Linker(df_1, settings, database_api=db_api) + res_dict = count_comparisons_from_blocking_rule(**args) -# res = linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions( -# "l.first_name = r.first_name OR l.surname = r.surname", -# ) -# assert res == 5 * 5 + res = res_dict["number_of_comparisons_generated_pre_filter_conditions"] -# res = linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions( -# "l.first_name = r.first_name AND levenshtein(l.surname, r.surname) <2", -# ) -# assert res == 3 * 3 + 1 * 1 + 1 * 1 + assert res == 5 * 5 -# settings = {"link_type": "link_and_dedupe"} -# db_api = DuckDBAPI() + args["blocking_rule"] = "l.first_name = r.first_name OR l.surname = r.surname" + res_dict = count_comparisons_from_blocking_rule(**args) + res = res_dict["number_of_comparisons_generated_pre_filter_conditions"] + assert res == 5 * 5 -# linker = Linker([df_1, df_2], settings, database_api=db_api) + # res = linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions( + # "l.first_name = r.first_name AND levenshtein(l.surname, r.surname) <2", + # ) + # assert res == 3 * 3 + 1 * 1 + 1 * 1 -# res = linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions( -# "l.first_name = r.first_name" -# ) -# assert res == 6 * 6 + 1 * 1 + 1 * 1 + args["blocking_rule"] = """l.first_name = r.first_name + AND levenshtein(l.surname, r.surname) <2""" + res_dict = count_comparisons_from_blocking_rule(**args) + res = res_dict["number_of_comparisons_generated_pre_filter_conditions"] + assert res == 3 * 3 + 1 * 1 + 1 * 1 -# settings = {"link_type": "link_only"} -# db_api = DuckDBAPI() + args["table_or_tables"] = [df_1, df_2] + args["link_type"] = "link_and_dedupe" + args["blocking_rule"] = block_on("first_name") -# linker = Linker([df_1, df_2], settings, database_api=db_api) + res_dict = count_comparisons_from_blocking_rule(**args) + res = res_dict["number_of_comparisons_generated_pre_filter_conditions"] -# res = linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions( -# "l.first_name = r.first_name" -# ) -# assert res == 3 * 3 - -# # Test a series of blocking rules with different edge cases. 
-# # Assert that the naive methodology gives the same result as the new methodlogy - -# df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv") - -# blocking_rules = [ -# "l.first_name = r.first_name", -# "l.first_name = r.first_name AND l.surname = r.surname", -# "substr(l.first_name,2,3) = substr(r.first_name,3,4)", -# "substr(l.first_name,1,1) = substr(r.surname,1,1) and l.dob = r.dob", -# "l.first_name = r.first_name and levenshtein(l.dob, r.dob) > -1", -# "l.dob = r.dob and substr(l.first_name,2,3) = substr(r.first_name,3,4)", -# ] - -# sql_template = """ -# select count(*) -# from df as l -# inner join df as r -# on {blocking_rule} -# """ - -# results = {} -# for br in blocking_rules: -# sql = sql_template.format(blocking_rule=br) -# res = duckdb.sql(sql).df() -# results[br] = {"count_from_join_dedupe_only": res.iloc[0][0]} - -# db_api = DuckDBAPI() - -# linker = Linker(df, {"link_type": "dedupe_only"}, database_api=db_api) -# for br in blocking_rules: -# c = linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions(br) -# results[br]["count_from_efficient_fn_dedupe_only"] = c - -# for br in blocking_rules: -# assert ( -# results[br]["count_from_join_dedupe_only"] -# == results[br]["count_from_efficient_fn_dedupe_only"] -# ) - -# # Link only -# df_l = df.iloc[::2].copy() # even-indexed rows (starting from 0) -# df_r = df.iloc[1::2].copy() # odd-indexed rows (starting from 1) - -# sql_template = """ -# select count(*) -# from df_l as l -# inner join df_r as r -# on {blocking_rule} -# """ - -# results = {} -# for br in blocking_rules: -# sql = sql_template.format(blocking_rule=br) -# res = duckdb.sql(sql).df() -# results[br] = {"count_from_join_link_only": res.iloc[0][0]} - -# db_api = DuckDBAPI() - -# linker = Linker([df_l, df_r], {"link_type": "link_only"}, database_api=db_api) -# for br in blocking_rules: -# c = linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions(br) -# results[br]["count_from_efficient_fn_link_only"] = c - -# for br in blocking_rules: -# assert ( -# results[br]["count_from_join_link_only"] -# == results[br]["count_from_efficient_fn_link_only"] -# ) - - -# def test_blocking_rule_accepts_different_dialects(): -# br = "l.first_name = r.first_name" -# br = BlockingRule(br, sqlglot_dialect="spark") -# assert br._equi_join_conditions == [("first_name", "first_name")] - -# br = "l.`hi THERE` = r.`hi THERE`" -# br = BlockingRule(br, sqlglot_dialect="spark") - -# assert br._equi_join_conditions == [("`hi THERE`", "`hi THERE`")] + assert res == 6 * 6 + 1 * 1 + 1 * 1 + + args["link_type"] = "link_only" + args["blocking_rule"] = block_on("first_name") + + res_dict = count_comparisons_from_blocking_rule(**args) + res = res_dict["number_of_comparisons_generated_pre_filter_conditions"] + assert res == 3 * 3 + + +def test_analyse_blocking_fast_methodology_edge_cases(): + # Test a series of blocking rules with different edge cases. 
+ # Assert that the naive methodology gives the same result as the new methodlogy + + df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv") + + blocking_rules = [ + "l.first_name = r.first_name", + "l.first_name = r.first_name AND l.surname = r.surname", + "substr(l.first_name,2,3) = substr(r.first_name,3,4)", + "substr(l.first_name,1,1) = substr(r.surname,1,1) and l.dob = r.dob", + "l.first_name = r.first_name and levenshtein(l.dob, r.dob) > -1", + "l.dob = r.dob and substr(l.first_name,2,3) = substr(r.first_name,3,4)", + ] + + sql_template = """ + select count(*) + from df as l + inner join df as r + on {blocking_rule} + """ + + results = {} + for br in blocking_rules: + sql = sql_template.format(blocking_rule=br) + res = duckdb.sql(sql).df() + results[br] = {"count_from_join_dedupe_only": res.iloc[0][0]} + + db_api = DuckDBAPI() + + for br in blocking_rules: + res_dict = count_comparisons_from_blocking_rule( + table_or_tables=df, + blocking_rule=br, + link_type="dedupe_only", + db_api=db_api, + unique_id_column_name="unique_id", + ) + c = res_dict["number_of_comparisons_generated_pre_filter_conditions"] + + results[br]["count_from_efficient_fn_dedupe_only"] = c + + for br in blocking_rules: + assert ( + results[br]["count_from_join_dedupe_only"] + == results[br]["count_from_efficient_fn_dedupe_only"] + ) + + # Link only + df_l = df.iloc[::2].copy() # even-indexed rows (starting from 0) + df_r = df.iloc[1::2].copy() # odd-indexed rows (starting from 1) + + sql_template = """ + select count(*) + from df_l as l + inner join df_r as r + on {blocking_rule} + """ + + results = {} + for br in blocking_rules: + sql = sql_template.format(blocking_rule=br) + res = duckdb.sql(sql).df() + results[br] = {"count_from_join_link_only": res.iloc[0][0]} + + db_api = DuckDBAPI() + + for br in blocking_rules: + res_dict = count_comparisons_from_blocking_rule( + table_or_tables=[df_l, df_r], + blocking_rule=br, + link_type="link_only", + db_api=db_api, + unique_id_column_name="unique_id", + ) + c = res_dict["number_of_comparisons_generated_pre_filter_conditions"] + + results[br]["count_from_efficient_fn_link_only"] = c + + for br in blocking_rules: + assert ( + results[br]["count_from_join_link_only"] + == results[br]["count_from_efficient_fn_link_only"] + ) + + +def test_blocking_rule_accepts_different_dialects(): + br = "l.first_name = r.first_name" + br = BlockingRule(br, sqlglot_dialect="spark") + assert br._equi_join_conditions == [("first_name", "first_name")] + + br = "l.`hi THERE` = r.`hi THERE`" + br = BlockingRule(br, sqlglot_dialect="spark") + + assert br._equi_join_conditions == [("`hi THERE`", "`hi THERE`")] # @mark_with_dialects_excluding() From 71c9fee3c9c41e75f735914653cd78785fbdab93 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 09:20:03 +0100 Subject: [PATCH 21/59] final fixes to test analyse blocking --- tests/test_analyse_blocking.py | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/tests/test_analyse_blocking.py b/tests/test_analyse_blocking.py index bdb3553f32..76e4407544 100644 --- a/tests/test_analyse_blocking.py +++ b/tests/test_analyse_blocking.py @@ -3,15 +3,14 @@ from splink.analyse_blocking import ( count_comparisons_from_blocking_rule, + cumulative_comparisons_to_be_scored_from_blocking_rules_chart, cumulative_comparisons_to_be_scored_from_blocking_rules_data, ) from splink.blocking import BlockingRule from splink.blocking_rule_library import CustomRule, Or, block_on from 
splink.duckdb.database_api import DuckDBAPI -from splink.linker import Linker -from .basic_settings import get_settings_dict -from .decorator import mark_with_dialects_excluding +from .decorator import mark_with_dialects_excluding, mark_with_dialects_including @mark_with_dialects_excluding() @@ -383,6 +382,7 @@ def test_analyse_blocking_fast_methodology(): assert res == 3 * 3 +@mark_with_dialects_including("duckdb") def test_analyse_blocking_fast_methodology_edge_cases(): # Test a series of blocking rules with different edge cases. # Assert that the naive methodology gives the same result as the new methodlogy @@ -480,21 +480,17 @@ def test_blocking_rule_accepts_different_dialects(): assert br._equi_join_conditions == [("`hi THERE`", "`hi THERE`")] -# @mark_with_dialects_excluding() -# def test_cumulative_br_funs(test_helpers, dialect): -# helper = test_helpers[dialect] -# Linker = helper.Linker - -# df = helper.load_frame_from_csv("./tests/datasets/fake_1000_from_splink_demos.csv") - -# linker = Linker(df, get_settings_dict(), **helper.extra_linker_args()) -# linker.cumulative_comparisons_from_blocking_rules_records() -# linker.cumulative_comparisons_from_blocking_rules_records( -# ["l.first_name = r.first_name", block_on("surname")] -# ) +@mark_with_dialects_excluding() +def test_chart(test_helpers, dialect): + helper = test_helpers[dialect] + db_api = helper.DatabaseAPI(**helper.db_api_args()) -# linker.cumulative_num_comparisons_from_blocking_rules_chart( -# ["l.first_name = r.first_name", block_on("surname")] -# ) + df = helper.load_frame_from_csv("./tests/datasets/fake_1000_from_splink_demos.csv") -# assert linker.count_num_comparisons_from_blocking_rule(block_on("surname")) == 3167 + cumulative_comparisons_to_be_scored_from_blocking_rules_chart( + table_or_tables=df, + blocking_rule_creators=[block_on("first_name"), "l.surname = r.surname"], + link_type="dedupe_only", + db_api=db_api, + unique_id_column_name="unique_id", + ) From 6b78f9b5c7270472b63010f8cedde38fbe887910 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 09:21:44 +0100 Subject: [PATCH 22/59] fix full example postgres --- tests/test_full_example_postgres.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/tests/test_full_example_postgres.py b/tests/test_full_example_postgres.py index 86db08b666..d42ffcdc1d 100644 --- a/tests/test_full_example_postgres.py +++ b/tests/test_full_example_postgres.py @@ -2,6 +2,10 @@ import pandas as pd +from splink.analyse_blocking import ( + count_comparisons_from_blocking_rule, + cumulative_comparisons_to_be_scored_from_blocking_rules_chart, +) from splink.exploratory import completeness_chart, profile_columns from splink.linker import Linker from splink.postgres.database_api import PostgresAPI @@ -23,15 +27,24 @@ def test_full_example_postgres(tmp_path, pg_engine): database_api=db_api, ) - linker.count_num_comparisons_from_blocking_rule( - 'l.first_name = r.first_name and l."surname" = r."surname"' + count_comparisons_from_blocking_rule( + table_or_tables=df, + blocking_rule='l.first_name = r.first_name and l."SUR name" = r."SUR name"', + link_type="dedupe_only", + db_api=db_api, + unique_id_column_name="unique_id", ) - linker.cumulative_num_comparisons_from_blocking_rules_chart( - [ + + cumulative_comparisons_to_be_scored_from_blocking_rules_chart( + table_or_tables=df, + blocking_rule_creators=[ "l.first_name = r.first_name", "l.surname = r.surname", "l.city = r.city", - ] + ], + link_type="dedupe_only", + db_api=db_api, + 
unique_id_column_name="unique_id", ) profile_columns( From 4468b9db5c644933244dfe7bd1bf67e9c3d5b797 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 09:35:01 +0100 Subject: [PATCH 23/59] fix postgres test --- tests/test_full_example_postgres.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_full_example_postgres.py b/tests/test_full_example_postgres.py index d42ffcdc1d..b4d67a45a9 100644 --- a/tests/test_full_example_postgres.py +++ b/tests/test_full_example_postgres.py @@ -29,7 +29,7 @@ def test_full_example_postgres(tmp_path, pg_engine): count_comparisons_from_blocking_rule( table_or_tables=df, - blocking_rule='l.first_name = r.first_name and l."SUR name" = r."SUR name"', + blocking_rule='l.first_name = r.first_name and l."surname" = r."surname"', link_type="dedupe_only", db_api=db_api, unique_id_column_name="unique_id", From 004ed96fac18ebb76b8947e6165ac4490bd2470d Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 09:48:16 +0100 Subject: [PATCH 24/59] fix case of no matches returned --- splink/analyse_blocking.py | 40 ++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/splink/analyse_blocking.py b/splink/analyse_blocking.py index f319f3aa49..16a33fdc02 100644 --- a/splink/analyse_blocking.py +++ b/splink/analyse_blocking.py @@ -350,17 +350,24 @@ def _cumulative_comparisons_to_be_scored_from_blocking_rules( "blocking_rule": [br.blocking_rule_sql for br in blocking_rules], } ) + if len(result_df) > 0: + complete_df = all_rules_df.merge(result_df, on="match_key", how="left").fillna( + {"row_count": 0} + ) - complete_df = all_rules_df.merge(result_df, on="match_key", how="left").fillna( - {"row_count": 0} - ) + complete_df["cumulative_rows"] = complete_df["row_count"].cumsum().astype(int) + complete_df["start"] = complete_df["cumulative_rows"] - complete_df["row_count"] + complete_df["cartesian"] = cartesian_count - complete_df["cumulative_rows"] = complete_df["row_count"].cumsum().astype(int) - complete_df["start"] = complete_df["cumulative_rows"] - complete_df["row_count"] - complete_df["cartesian"] = cartesian_count + for c in ["row_count", "cumulative_rows", "cartesian", "start"]: + complete_df[c] = complete_df[c].astype(int) - for c in ["row_count", "cumulative_rows", "cartesian", "start"]: - complete_df[c] = complete_df[c].astype(int) + else: + complete_df = all_rules_df.copy() + complete_df["row_count"] = 0 + complete_df["cumulative_rows"] = 0 + complete_df["cartesian"] = cartesian_count + complete_df["start"] = 0 [b.drop_materialised_id_pairs_dataframe() for b in exploding_br_with_id_tables] @@ -372,21 +379,8 @@ def _cumulative_comparisons_to_be_scored_from_blocking_rules( "match_key", "start", ] - if len(complete_df) > 0: - return complete_df[col_order] - else: - return pd.DataFrame( - [ - { - "blocking_rule": "No blocking rules", - "row_count": 0, - "cumulative_rows": 0, - "cartesian": cartesian_count, - "match_key": 0, - "start": 0, - } - ] - ) + + return complete_df[col_order] def _count_comparisons_generated_from_blocking_rule( From 2343c189782c91ccd87493bf4962da5867faee1b Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 09:56:25 +0100 Subject: [PATCH 25/59] fix autofixable --- splink/blocking.py | 1 - tests/test_full_example_duckdb.py | 1 - 2 files changed, 2 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index 406cedf1dd..afbc3f02b4 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -21,7 +21,6 @@ # 
https://stackoverflow.com/questions/39740632/python-type-hinting-without-cyclic-imports if TYPE_CHECKING: - from .linker import Linker from .settings import LinkTypeLiteralType diff --git a/tests/test_full_example_duckdb.py b/tests/test_full_example_duckdb.py index 55beda28d3..2b33dc7337 100644 --- a/tests/test_full_example_duckdb.py +++ b/tests/test_full_example_duckdb.py @@ -17,7 +17,6 @@ from .decorator import mark_with_dialects_including from .linker_utils import ( _test_table_registration, - _test_write_functionality, register_roc_data, ) From 631818940d938f3950fa8a52f6d44726b730adac Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 10:03:23 +0100 Subject: [PATCH 26/59] formatting --- splink/blocking.py | 5 ++++- tests/test_full_example_duckdb.py | 23 ++++++++++++----------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index afbc3f02b4..002fe4470a 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -155,7 +155,10 @@ def create_blocked_pairs_sql( on ({self.blocking_rule_sql}) {where_condition} - {self.exclude_pairs_generated_by_all_preceding_rules_sql(source_dataset_input_column, unique_id_input_column)} + {self.exclude_pairs_generated_by_all_preceding_rules_sql( + source_dataset_input_column, + unique_id_input_column) + } """ return sql diff --git a/tests/test_full_example_duckdb.py b/tests/test_full_example_duckdb.py index 2b33dc7337..f47394d253 100644 --- a/tests/test_full_example_duckdb.py +++ b/tests/test_full_example_duckdb.py @@ -17,6 +17,7 @@ from .decorator import mark_with_dialects_including from .linker_utils import ( _test_table_registration, + _test_write_functionality, register_roc_data, ) @@ -119,21 +120,21 @@ def test_full_example_duckdb(tmp_path): "cluster": 10000, } - # linker.find_matches_to_new_records( - # [record], blocking_rules=[], match_weight_threshold=-10000 - # ) + linker.find_matches_to_new_records( + [record], blocking_rules=[], match_weight_threshold=-10000 + ) - # # Test saving and loading - # path = os.path.join(tmp_path, "model.json") - # linker.save_model_to_json(path) + # Test saving and loading + path = os.path.join(tmp_path, "model.json") + linker.save_model_to_json(path) - # db_api = DuckDBAPI() - # linker_2 = Linker(df, settings=simple_settings, database_api=db_api) + db_api = DuckDBAPI() + linker_2 = Linker(df, settings=simple_settings, database_api=db_api) - # linker_2 = Linker(df, database_api=db_api, settings=path) + linker_2 = Linker(df, database_api=db_api, settings=path) - # # Test that writing to files works as expected - # _test_write_functionality(linker_2, pd.read_csv) + # Test that writing to files works as expected + _test_write_functionality(linker_2, pd.read_csv) # Create some dummy dataframes for the link only test From 505ad99b5fe757e7d55cd0360376ea2279c2b3ce Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 10:17:18 +0100 Subject: [PATCH 27/59] fix mypy errors --- splink/blocking.py | 66 +++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/splink/blocking.py b/splink/blocking.py index 002fe4470a..07da8e8825 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -1,14 +1,14 @@ from __future__ import annotations import logging -from typing import TYPE_CHECKING, Any, List, Optional, Tuple +from typing import TYPE_CHECKING, Any, List, Optional from sqlglot import parse_one from sqlglot.expressions import Column, Expression, Identifier, Join from 
sqlglot.optimizer.eliminate_joins import join_condition
 from sqlglot.optimizer.optimizer import optimize
 
-from .database_api import DatabaseAPI
+from .database_api import DatabaseAPISubClass
 from .exceptions import SplinkException
 from .input_column import InputColumn
 from .misc import ensure_is_list
@@ -61,11 +61,11 @@ def blocking_rule_to_obj(br: BlockingRule | dict[str, Any] | str) -> BlockingRul
 def combine_unique_id_input_columns(
     source_dataset_input_column: InputColumn, unique_id_input_column: InputColumn
-) -> Tuple[InputColumn]:
-    unique_id_input_columns = ()
+) -> List[InputColumn]:
+    unique_id_input_columns: List[InputColumn] = []
     if source_dataset_input_column:
-        unique_id_input_columns += (source_dataset_input_column,)
-    unique_id_input_columns += (unique_id_input_column,)
+        unique_id_input_columns.append(source_dataset_input_column)
+    unique_id_input_columns.append(unique_id_input_column)
     return unique_id_input_columns
 
 
@@ -103,7 +103,7 @@ def exclude_pairs_generated_by_this_rule_sql(
         self,
         source_dataset_input_column: InputColumn,
         unique_id_input_column: InputColumn,
-    ):
+    ) -> str:
         """A SQL string specifying how to exclude the results
         of THIS blocking rule from subsequent blocking statements,
         so that subsequent statements do not produce duplicate pairs
@@ -118,7 +118,7 @@ def exclude_pairs_generated_by_all_preceding_rules_sql(
         self,
         source_dataset_input_column: InputColumn,
         unique_id_input_column: InputColumn,
-    ):
+    ) -> str:
         """A SQL string that excludes the results of ALL previous blocking rules
         from the pairwise comparisons generated.
         """
@@ -136,15 +136,15 @@ def exclude_pairs_generated_by_all_preceding_rules_sql(
 
     def create_blocked_pairs_sql(
         self,
+        *,
         source_dataset_input_column: InputColumn,
         unique_id_input_column: InputColumn,
-        *,
-        input_tablename_l,
-        input_tablename_r,
-        where_condition,
-        probability,
-        sql_select_expr,
-    ):
+        input_tablename_l: str,
+        input_tablename_r: str,
+        where_condition: str,
+        probability: str,
+        sql_select_expr: str,
+    ) -> str:
         sql = f"""
         select
         {sql_select_expr}
@@ -278,12 +278,12 @@ def create_blocked_pairs_sql(
         *,
         source_dataset_input_column: InputColumn,
         unique_id_input_column: InputColumn,
-        input_tablename_l,
-        input_tablename_r,
-        where_condition,
-        probability,
-        sql_select_expr,
-    ):
+        input_tablename_l: str,
+        input_tablename_r: str,
+        where_condition: str,
+        probability: str,
+        sql_select_expr: str,
+    ) -> str:
         sqls = []
         exclude_sql = self.exclude_pairs_generated_by_all_preceding_rules_sql(
             source_dataset_input_column, unique_id_input_column
@@ -336,7 +336,7 @@ def marginal_exploded_id_pairs_table_sql(
         unique_id_input_column: InputColumn,
         br: BlockingRule,
         link_type: "LinkTypeLiteralType",
-    ):
+    ) -> str:
         """generates a table of the marginal id pairs from the exploded blocking rule
         i.e. pairs are only created
         that match this blocking rule and NOT any of the preceding
         blocking rules
 
@@ -382,7 +382,7 @@ def exclude_pairs_generated_by_this_rule_sql(
         self,
         source_dataset_input_column: InputColumn,
         unique_id_input_column: InputColumn,
-    ):
+    ) -> str:
         """A SQL string specifying how to exclude the results
         of THIS blocking rule from subsequent blocking statements,
         so that subsequent statements do not produce duplicate pairs
@@ -419,12 +419,12 @@ def create_blocked_pairs_sql(
         *,
         source_dataset_input_column: InputColumn,
         unique_id_input_column: InputColumn,
-        input_tablename_l,
-        input_tablename_r,
-        where_condition,
-        probability,
-        sql_select_expr,
-    ):
+        input_tablename_l: str,
+        input_tablename_r: str,
+        where_condition: str,
+        probability: str,
+        sql_select_expr: str,
+    ) -> str:
         if self.exploded_id_pair_table is None:
             raise ValueError(
                 "Exploding blocking rules are not supported for the function you have"
@@ -462,11 +462,11 @@ def as_dict(self):
 def materialise_exploded_id_tables(
     link_type: "LinkTypeLiteralType",
     blocking_rules: List[BlockingRule],
-    db_api: DatabaseAPI,
+    db_api: DatabaseAPISubClass,
     splink_df_dict: dict[str, SplinkDataFrame],
     source_dataset_input_column: InputColumn,
     unique_id_input_column: InputColumn,
-):
+) -> list[ExplodingBlockingRule]:
     exploding_blocking_rules = [
         br for br in blocking_rules if isinstance(br, ExplodingBlockingRule)
     ]
@@ -480,7 +480,7 @@ def materialise_exploded_id_tables(
     sql = vertically_concatenate_sql(
         splink_df_dict,
         salting_required=False,
-        source_dataset_column_name=source_dataset_input_column,
+        source_dataset_column_name=source_dataset_input_column.name,
     )
     pipeline.enqueue_sql(sql, "__splink__df_concat")
     nodes_concat = db_api.sql_pipeline_to_splink_dataframe(pipeline)
@@ -549,7 +549,7 @@ def block_using_rules_sqls(
     source_dataset_input_column: InputColumn,
     unique_id_input_column: InputColumn,
     set_match_probability_to_one: bool = False,
-):
+) -> list[dict[str, str]]:
     """Use the blocking rules specified in the linker's settings object to
     generate a SQL statement that will create pairwise record comparisons
     according to the blocking rule(s).
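For readers following the refactor: by this point in the series the blocking analysis can be exercised directly, without constructing a Linker. A minimal sketch, mirroring the calls in tests/test_analyse_blocking.py (DuckDB backend, using the fake_1000 demo CSV exercised throughout the tests):

    import pandas as pd

    from splink.analyse_blocking import count_comparisons_from_blocking_rule
    from splink.duckdb.database_api import DuckDBAPI

    df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
    db_api = DuckDBAPI()

    res = count_comparisons_from_blocking_rule(
        table_or_tables=df,
        blocking_rule="l.first_name = r.first_name",
        link_type="dedupe_only",
        db_api=db_api,
        unique_id_column_name="unique_id",
    )

    # Count of pairs produced by the equi-join conditions alone (fast to compute)
    res["number_of_comparisons_generated_pre_filter_conditions"]
    # Count after any remaining filter conditions are applied
    res["number_of_comparisons_to_be_scored_post_filter_conditions"]
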
From 96af2766ff28e5944f85d896c5349feb10d6dd1c Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 10:34:28 +0100 Subject: [PATCH 28/59] more mypy --- splink/analyse_blocking.py | 68 ++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/splink/analyse_blocking.py b/splink/analyse_blocking.py index 16a33fdc02..84dfd6202d 100644 --- a/splink/analyse_blocking.py +++ b/splink/analyse_blocking.py @@ -1,7 +1,7 @@ from __future__ import annotations import logging -from typing import Iterable, List, Literal, Union +from typing import Iterable, List, Literal, Optional, Union import pandas as pd import sqlglot @@ -15,7 +15,7 @@ from .blocking_rule_creator import BlockingRuleCreator from .blocking_rule_creator_utils import to_blocking_rule_creator from .charts import cumulative_blocking_rule_comparisons_generated -from .database_api import DatabaseAPI, DatabaseAPISubClass +from .database_api import DatabaseAPISubClass from .input_column import InputColumn from .misc import calculate_cartesian from .pipeline import CTEPipeline @@ -33,11 +33,11 @@ def _number_of_comparisons_generated_by_blocking_rule_post_filters_sqls( input_data_dict: dict[str, "SplinkDataFrame"], - blocking_rule: Union[str, "BlockingRule"], + blocking_rule: "BlockingRule", link_type: str, db_api: DatabaseAPISubClass, unique_id_column_name: str, -) -> str: +) -> list[dict[str, str]]: input_dataframes = list(input_data_dict.values()) two_dataset_link_only = link_type == "link_only" and len(input_dataframes) == 2 @@ -85,15 +85,12 @@ def _number_of_comparisons_generated_by_blocking_rule_post_filters_sqls( def _count_comparisons_from_blocking_rule_pre_filter_conditions_sqls( input_data_dict: dict[str, "SplinkDataFrame"], - blocking_rule: Union[str, "BlockingRule"], + blocking_rule: "BlockingRule", link_type: str, db_api: DatabaseAPISubClass, -): +) -> list[dict[str, str]]: input_dataframes = list(input_data_dict.values()) - if isinstance(blocking_rule, str): - blocking_rule = BlockingRule(blocking_rule, sqlglot_dialect=db_api.sql_dialect) - join_conditions = blocking_rule._equi_join_conditions two_dataset_link_only = link_type == "link_only" and len(input_dataframes) == 2 @@ -192,11 +189,12 @@ def _count_comparisons_from_blocking_rule_pre_filter_conditions_sqls( def _row_counts_per_input_table( + *, splink_df_dict: dict[str, "SplinkDataFrame"], link_type: link_type_type, - source_dataset_column_name: str, - db_api: DatabaseAPI, -): + source_dataset_column_name: Optional[str], + db_api: DatabaseAPISubClass, +) -> "SplinkDataFrame": pipeline = CTEPipeline() sql = vertically_concatenate_sql( @@ -224,12 +222,12 @@ def _row_counts_per_input_table( def _cumulative_comparisons_to_be_scored_from_blocking_rules( *, splink_df_dict: dict[str, "SplinkDataFrame"], - blocking_rules: Iterable[BlockingRule], + blocking_rules: List[BlockingRule], link_type: link_type_type, - db_api: DatabaseAPI, - max_rows_limit: int = 1e9, + db_api: DatabaseAPISubClass, + max_rows_limit: float = 1e9, unique_id_column_name: str, - source_dataset_column_name: str = None, + source_dataset_column_name: Optional[str], ) -> pd.DataFrame: unique_id_input_column = InputColumn( unique_id_column_name, sql_dialect=db_api.sql_dialect.name @@ -238,10 +236,16 @@ def _cumulative_comparisons_to_be_scored_from_blocking_rules( source_dataset_input_column = None input_columns = [unique_id_input_column] else: - source_dataset_input_column = InputColumn( - source_dataset_column_name, sql_dialect=db_api.sql_dialect.name - ) - 
input_columns = [unique_id_input_column, source_dataset_input_column]
+        if source_dataset_column_name is not None:
+            source_dataset_input_column = InputColumn(
+                source_dataset_column_name, sql_dialect=db_api.sql_dialect.name
+            )
+            input_columns = [unique_id_input_column, source_dataset_input_column]
+        else:
+            raise ValueError(
+                "source_dataset_column_name cannot be None "
+                "for link_type other than 'dedupe_only'"
+            )
 
     # Check none of the blocking rules will create a vast/computationally
     # intractable number of comparisons
@@ -271,10 +275,10 @@ def _cumulative_comparisons_to_be_scored_from_blocking_rules(
     )
 
     rc = _row_counts_per_input_table(
-        splink_df_dict,
-        link_type,
-        source_dataset_column_name,
-        db_api,
+        splink_df_dict=splink_df_dict,
+        link_type=link_type,
+        source_dataset_column_name=source_dataset_column_name,
+        db_api=db_api,
     ).as_record_dict()
 
     cartesian_count = calculate_cartesian(rc, link_type)

From 84c19cbc53e5253b00f72c844b90a70247d10b88 Mon Sep 17 00:00:00 2001
From: Robin Linacre
Date: Wed, 15 May 2024 13:18:23 +0100
Subject: [PATCH 29/59] fix array explode test

---
 splink/blocking.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/splink/blocking.py b/splink/blocking.py
index 07da8e8825..5b3faf4e79 100644
--- a/splink/blocking.py
+++ b/splink/blocking.py
@@ -477,10 +477,14 @@ def materialise_exploded_id_tables(
 
     pipeline = CTEPipeline()
 
+    source_dataset_column_name = (
+        source_dataset_input_column.name if source_dataset_input_column else None
+    )
+
     sql = vertically_concatenate_sql(
         splink_df_dict,
         salting_required=False,
-        source_dataset_column_name=source_dataset_input_column.name,
+        source_dataset_column_name=source_dataset_column_name,
     )
 
pipeline.enqueue_sql(sql, "__splink__df_concat")
     nodes_concat = db_api.sql_pipeline_to_splink_dataframe(pipeline)

From 7b28cb07c198b2df92b124a2598c5d4163b9e3c2 Mon Sep 17 00:00:00 2001
From: Robin Linacre
Date: Wed, 15 May 2024 13:38:42 +0100
Subject: [PATCH 30/59] fix link type options

---
 splink/analyse_blocking.py | 24 +++++++++++++-----------
 splink/blocking.py         | 10 ++++++++--
 2 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/splink/analyse_blocking.py b/splink/analyse_blocking.py
index 84dfd6202d..b460ab2d6b 100644
--- a/splink/analyse_blocking.py
+++ b/splink/analyse_blocking.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import logging
-from typing import Iterable, List, Literal, Optional, Union
+from typing import Iterable, List, Optional, Union
 
 import pandas as pd
 import sqlglot
@@ -9,8 +9,10 @@
 from .blocking import (
     BlockingRule,
     _sql_gen_where_condition,
+    backend_link_type_options,
     block_using_rules_sqls,
     materialise_exploded_id_tables,
+    user_input_link_type_options,
 )
 from .blocking_rule_creator import BlockingRuleCreator
 from .blocking_rule_creator_utils import to_blocking_rule_creator
@@ -28,9 +30,6 @@
 logger = logging.getLogger(__name__)
 
-link_type_type = Literal["link_only", "link_and_dedupe", "dedupe_only"]
-
-
 def _number_of_comparisons_generated_by_blocking_rule_post_filters_sqls(
     input_data_dict: dict[str, "SplinkDataFrame"],
     blocking_rule: "BlockingRule",
@@ -190,7 +189,7 @@ def _row_counts_per_input_table(
 def _row_counts_per_input_table(
     *,
     splink_df_dict: dict[str, "SplinkDataFrame"],
-    link_type: link_type_type,
+    link_type: backend_link_type_options,
     source_dataset_column_name: Optional[str],
     db_api: DatabaseAPISubClass,
 ) -> "SplinkDataFrame":
@@ -223,7 +222,7 @@ def _cumulative_comparisons_to_be_scored_from_blocking_rules(
     *,
     splink_df_dict: dict[str, "SplinkDataFrame"],
     blocking_rules: List[BlockingRule],
-    link_type: link_type_type,
+    link_type: backend_link_type_options,
     db_api: DatabaseAPISubClass,
     max_rows_limit: float = 1e9,
     unique_id_column_name: str,
@@ -310,8 +309,12 @@ def _cumulative_comparisons_to_be_scored_from_blocking_rules(
     )
 
     blocking_input_tablename_l = "__splink__df_concat"
+    blocking_input_tablename_r = "__splink__df_concat"
 
     if len(splink_df_dict) == 2 and link_type == "link_only":
+        link_type = "two_dataset_link_only"
+
+    if link_type == "two_dataset_link_only" and source_dataset_column_name is not None:
         sqls = split_df_concat_with_tf_into_two_tables_sqls(
             "__splink__df_concat",
             source_dataset_column_name,
@@ -320,7 +323,6 @@ def _cumulative_comparisons_to_be_scored_from_blocking_rules(
         blocking_input_tablename_l = "__splink__df_concat_left"
         blocking_input_tablename_r = "__splink__df_concat_right"
-        link_type = "two_dataset_link_only"
 
     sqls = block_using_rules_sqls(
         input_tablename_l=blocking_input_tablename_l,
@@ -393,7 +393,7 @@ def _count_comparisons_generated_from_blocking_rule(
     *,
     splink_df_dict: dict[str, "SplinkDataFrame"],
     blocking_rule: BlockingRule,
-    link_type: link_type_type,
+    link_type: backend_link_type_options,
     db_api: DatabaseAPISubClass,
     compute_post_filter_count: bool,
     max_rows_limit: float = 1e9,
@@ -473,7 +473,7 @@ def count_comparisons_from_blocking_rule(
     *,
     table_or_tables,
     blocking_rule: Union[BlockingRuleCreator, str, dict],
-    link_type: link_type_type,
+    link_type: user_input_link_type_options,
     db_api: DatabaseAPISubClass,
     unique_id_column_name: str,
     compute_post_filter_count: bool = True,
@@ -501,7 +501,7 @@ def 
cumulative_comparisons_to_be_scored_from_blocking_rules_data( *, table_or_tables, blocking_rule_creators: Iterable[Union[BlockingRuleCreator, str, dict]], - link_type: link_type_type, + link_type: user_input_link_type_options, db_api: DatabaseAPISubClass, unique_id_column_name: str, max_rows_limit: float = 1e9, @@ -531,7 +533,7 @@ def cumulative_comparisons_to_be_scored_from_blocking_rules_chart( *, table_or_tables, blocking_rule_creators: Iterable[Union[BlockingRuleCreator, str, dict]], - link_type: link_type_type, + link_type: user_input_link_type_options, db_api: DatabaseAPISubClass, unique_id_column_name: str, max_rows_limit: float = 1e9, diff --git a/splink/blocking.py b/splink/blocking.py index 5b3faf4e79..b5589f5ba4 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -1,7 +1,7 @@ from __future__ import annotations import logging -from typing import TYPE_CHECKING, Any, List, Optional +from typing import TYPE_CHECKING, Any, List, Literal, Optional from sqlglot import parse_one from sqlglot.expressions import Column, Expression, Identifier, Join @@ -23,6 +23,12 @@ if TYPE_CHECKING: from .settings import LinkTypeLiteralType +user_input_link_type_options = Literal["link_only", "link_and_dedupe", "dedupe_only"] + +backend_link_type_options = Literal[ + "link_only", "link_and_dedupe", "dedupe_only", "two_dataset_link_only", "self_link" +] + def blocking_rule_to_obj(br: BlockingRule | dict[str, Any] | str) -> BlockingRule: if isinstance(br, BlockingRule): @@ -464,7 +470,7 @@ def materialise_exploded_id_tables( blocking_rules: List[BlockingRule], db_api: DatabaseAPISubClass, splink_df_dict: dict[str, SplinkDataFrame], - source_dataset_input_column: InputColumn, + source_dataset_input_column: Optional[InputColumn], unique_id_input_column: InputColumn, ) -> list[ExplodingBlockingRule]: exploding_blocking_rules = [ From 8c8589f79000271469be7256d6bf2055477d2e3e Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 13:52:53 +0100 Subject: [PATCH 31/59] fix more mypy errors --- splink/analyse_blocking.py | 28 ++++++++++++++-------------- splink/blocking.py | 2 +- splink/linker.py | 2 +- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/splink/analyse_blocking.py b/splink/analyse_blocking.py index b460ab2d6b..cccf677a7c 100644 --- a/splink/analyse_blocking.py +++ b/splink/analyse_blocking.py @@ -16,7 +16,7 @@ ) from .blocking_rule_creator import BlockingRuleCreator from .blocking_rule_creator_utils import to_blocking_rule_creator -from .charts import cumulative_blocking_rule_comparisons_generated +from .charts import ChartReturnType, cumulative_blocking_rule_comparisons_generated from .database_api import DatabaseAPISubClass from .input_column import InputColumn from .misc import calculate_cartesian @@ -224,7 +224,7 @@ def _cumulative_comparisons_to_be_scored_from_blocking_rules( blocking_rules: List[BlockingRule], link_type: backend_link_type_options, db_api: DatabaseAPISubClass, - max_rows_limit: float = 1e9, + max_rows_limit: int = int(1e9), unique_id_column_name: str, source_dataset_column_name: Optional[str], ) -> pd.DataFrame: @@ -263,7 +263,7 @@ def _cumulative_comparisons_to_be_scored_from_blocking_rules( "number_of_comparisons_generated_pre_filter_conditions" ] - if count_pre_filter > max_rows_limit: + if float(count_pre_filter) > max_rows_limit: # TODO: Use a SplinkException? 
Want this to give a sensible message
         # when coming from estimate_probability_two_random_records_match
         raise ValueError(
@@ -396,9 +396,9 @@ def _count_comparisons_generated_from_blocking_rule(
     link_type: backend_link_type_options,
     db_api: DatabaseAPISubClass,
     compute_post_filter_count: bool,
-    max_rows_limit: float = 1e9,
+    max_rows_limit: int = int(1e9),
     unique_id_column_name: str = "unique_id",
-):
+) -> dict[str, Union[int, str]]:
     # TODO: if it's an exploding blocking rule, make sure we error out
     pipeline = CTEPipeline()
     sqls = _count_comparisons_from_blocking_rule_pre_filter_conditions_sqls(
@@ -423,7 +423,7 @@ def add_l_r(sql, table_name):
         for i, j in blocking_rule._equi_join_conditions
     ]

-    equi_join_conditions = " AND ".join(equi_join_conditions)
+    equi_join_conditions_joined = " AND ".join(equi_join_conditions)

     filter_conditions = blocking_rule._filter_conditions
     if filter_conditions == "TRUE":
@@ -434,7 +434,7 @@ def add_l_r(sql, table_name):
         "number_of_comparisons_generated_pre_filter_conditions": pre_filter_total,
         "number_of_comparisons_to_be_scored_post_filter_conditions": "not computed",
         "filter_conditions_identified": filter_conditions,
-        "equi_join_conditions_identified": equi_join_conditions,
+        "equi_join_conditions_identified": equi_join_conditions_joined,
     }

     if pre_filter_total < max_rows_limit:
@@ -465,7 +465,7 @@ def add_l_r(sql, table_name):
         "number_of_comparisons_generated_pre_filter_conditions": pre_filter_total,
         "number_of_comparisons_to_be_scored_post_filter_conditions": post_filter_total,
         "filter_conditions_identified": filter_conditions,
-        "equi_join_conditions_identified": equi_join_conditions,
+        "equi_join_conditions_identified": equi_join_conditions_joined,
     }


@@ -477,8 +477,8 @@ def count_comparisons_from_blocking_rule(
     db_api: DatabaseAPISubClass,
     unique_id_column_name: str,
     compute_post_filter_count: bool = True,
-    max_rows_limit: float = 1e9,
-):
+    max_rows_limit: int = int(1e9),
+) -> dict[str, Union[int, str]]:
     if not isinstance(blocking_rule, BlockingRule):
         blocking_rule = to_blocking_rule_creator(blocking_rule).get_blocking_rule(
             db_api.sql_dialect.name
@@ -504,9 +504,9 @@ def cumulative_comparisons_to_be_scored_from_blocking_rules_data(
     link_type: user_input_link_type_options,
     db_api: DatabaseAPISubClass,
     unique_id_column_name: str,
-    max_rows_limit: float = 1e9,
+    max_rows_limit: int = int(1e9),
     source_dataset_column_name: str = "source_dataset",
-):
+) -> pd.DataFrame:
     splink_df_dict = db_api.register_multiple_tables(table_or_tables)

     blocking_rules: List[BlockingRule] = []
@@ -536,9 +536,9 @@ def cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
     link_type: user_input_link_type_options,
     db_api: DatabaseAPISubClass,
     unique_id_column_name: str,
-    max_rows_limit: float = 1e9,
+    max_rows_limit: int = int(1e9),
     source_dataset_column_name: str = None,
-):
+) -> ChartReturnType:
     splink_df_dict = db_api.register_multiple_tables(table_or_tables)

     blocking_rules: List[BlockingRule] = []
diff --git a/splink/blocking.py b/splink/blocking.py
index b5589f5ba4..bf7b29ba36 100644
--- a/splink/blocking.py
+++ b/splink/blocking.py
@@ -556,7 +556,7 @@ def block_using_rules_sqls(
     blocking_rules: List[BlockingRule],
     link_type: "LinkTypeLiteralType",
     columns_to_select_sql: str,
-    source_dataset_input_column: InputColumn,
+    source_dataset_input_column: Optional[InputColumn],
     unique_id_input_column: InputColumn,
     set_match_probability_to_one: bool = False,
 ) -> list[dict[str, str]]:
diff --git a/splink/linker.py b/splink/linker.py
index ed45b57b88..3222ce1b2a 
100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -2803,7 +2803,7 @@ def estimate_probability_two_random_records_match( self, deterministic_matching_rules: List[Union[str, BlockingRuleCreator]], recall: float, - max_rows_limit: int = 1e9, + max_rows_limit: int = int(1e9), ): """Estimate the model parameter `probability_two_random_records_match` using a direct estimation approach. From 9d2f82b19e11a3144a4cf1ab53e38d8434b82de2 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 14:05:28 +0100 Subject: [PATCH 32/59] fix more mypy stuff --- splink/analyse_blocking.py | 25 ++++++++++++------------- splink/blocking_rule_creator.py | 5 ++++- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/splink/analyse_blocking.py b/splink/analyse_blocking.py index cccf677a7c..6478aea969 100644 --- a/splink/analyse_blocking.py +++ b/splink/analyse_blocking.py @@ -1,7 +1,7 @@ from __future__ import annotations import logging -from typing import Iterable, List, Optional, Union +from typing import Iterable, List, Optional, Sequence, Union import pandas as pd import sqlglot @@ -14,10 +14,10 @@ materialise_exploded_id_tables, user_input_link_type_options, ) -from .blocking_rule_creator import BlockingRuleCreator +from .blocking_rule_creator import acceptable_br_creator_types from .blocking_rule_creator_utils import to_blocking_rule_creator from .charts import ChartReturnType, cumulative_blocking_rule_comparisons_generated -from .database_api import DatabaseAPISubClass +from .database_api import AcceptableInputTableType, DatabaseAPISubClass from .input_column import InputColumn from .misc import calculate_cartesian from .pipeline import CTEPipeline @@ -471,18 +471,17 @@ def add_l_r(sql, table_name): def count_comparisons_from_blocking_rule( *, - table_or_tables, - blocking_rule: Union[BlockingRuleCreator, str, dict], + table_or_tables: Sequence[AcceptableInputTableType], + blocking_rule_creator: acceptable_br_creator_types, link_type: user_input_link_type_options, db_api: DatabaseAPISubClass, unique_id_column_name: str, compute_post_filter_count: bool = True, max_rows_limit: int = int(1e9), ) -> dict[str, Union[int, str]]: - if not isinstance(blocking_rule, BlockingRule): - blocking_rule = to_blocking_rule_creator(blocking_rule).get_blocking_rule( - db_api.sql_dialect.name - ) + blocking_rule = to_blocking_rule_creator(blocking_rule_creator).get_blocking_rule( + db_api.sql_dialect.name + ) splink_df_dict = db_api.register_multiple_tables(table_or_tables) @@ -499,8 +498,8 @@ def count_comparisons_from_blocking_rule( def cumulative_comparisons_to_be_scored_from_blocking_rules_data( *, - table_or_tables, - blocking_rule_creators: Iterable[Union[BlockingRuleCreator, str, dict]], + table_or_tables: Sequence[AcceptableInputTableType], + blocking_rule_creators: Iterable[acceptable_br_creator_types], link_type: user_input_link_type_options, db_api: DatabaseAPISubClass, unique_id_column_name: str, @@ -531,8 +530,8 @@ def cumulative_comparisons_to_be_scored_from_blocking_rules_data( def cumulative_comparisons_to_be_scored_from_blocking_rules_chart( *, - table_or_tables, - blocking_rule_creators: Iterable[Union[BlockingRuleCreator, str, dict]], + table_or_tables: Sequence[AcceptableInputTableType], + blocking_rule_creators: Iterable[acceptable_br_creator_types], link_type: user_input_link_type_options, db_api: DatabaseAPISubClass, unique_id_column_name: str, diff --git a/splink/blocking_rule_creator.py b/splink/blocking_rule_creator.py index 99716d1f92..8abfc980d7 100644 --- 
a/splink/blocking_rule_creator.py +++ b/splink/blocking_rule_creator.py @@ -1,7 +1,7 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import Any, final +from typing import Any, Union, final from .blocking import BlockingRule, blocking_rule_to_obj from .dialects import SplinkDialect @@ -51,3 +51,6 @@ def create_blocking_rule_dict(self, sql_dialect_str: str) -> dict[str, Any]: @final def get_blocking_rule(self, sql_dialect_str: str) -> BlockingRule: return blocking_rule_to_obj(self.create_blocking_rule_dict(sql_dialect_str)) + + +acceptable_br_creator_types = Union[BlockingRuleCreator, str, dict[str, Any]] From 59bdc386acf840a38682dea11f13213f0450530a Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 14:09:01 +0100 Subject: [PATCH 33/59] rename arg --- splink/analyse_blocking.py | 8 ++++---- tests/test_analyse_blocking.py | 24 +++++++++++++----------- tests/test_full_example_duckdb.py | 2 +- tests/test_full_example_postgres.py | 2 +- tests/test_total_comparison_count.py | 2 +- 5 files changed, 20 insertions(+), 18 deletions(-) diff --git a/splink/analyse_blocking.py b/splink/analyse_blocking.py index 6478aea969..78849589ec 100644 --- a/splink/analyse_blocking.py +++ b/splink/analyse_blocking.py @@ -479,15 +479,15 @@ def count_comparisons_from_blocking_rule( compute_post_filter_count: bool = True, max_rows_limit: int = int(1e9), ) -> dict[str, Union[int, str]]: - blocking_rule = to_blocking_rule_creator(blocking_rule_creator).get_blocking_rule( - db_api.sql_dialect.name - ) + blocking_rule_creator = to_blocking_rule_creator( + blocking_rule_creator + ).get_blocking_rule(db_api.sql_dialect.name) splink_df_dict = db_api.register_multiple_tables(table_or_tables) return _count_comparisons_generated_from_blocking_rule( splink_df_dict=splink_df_dict, - blocking_rule=blocking_rule, + blocking_rule=blocking_rule_creator, link_type=link_type, db_api=db_api, compute_post_filter_count=compute_post_filter_count, diff --git a/tests/test_analyse_blocking.py b/tests/test_analyse_blocking.py index 76e4407544..ed84224429 100644 --- a/tests/test_analyse_blocking.py +++ b/tests/test_analyse_blocking.py @@ -49,13 +49,13 @@ def test_analyse_blocking_slow_methodology(test_helpers, dialect): } res_dict = count_comparisons_from_blocking_rule( - table_or_tables=df_1, blocking_rule="1=1", **args + table_or_tables=df_1, blocking_rule_creator="1=1", **args ) res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"] assert res == 4 * 3 / 2 res_dict = count_comparisons_from_blocking_rule( - table_or_tables=df_1, blocking_rule=block_on("first_name"), **args + table_or_tables=df_1, blocking_rule_creator=block_on("first_name"), **args ) res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"] @@ -63,33 +63,35 @@ def test_analyse_blocking_slow_methodology(test_helpers, dialect): args["link_type"] = "link_only" res_dict = count_comparisons_from_blocking_rule( - table_or_tables=[df_1, df_2], blocking_rule="1=1", **args + table_or_tables=[df_1, df_2], blocking_rule_creator="1=1", **args ) res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"] assert res == 4 * 3 res_dict = count_comparisons_from_blocking_rule( - table_or_tables=[df_1, df_2], blocking_rule=block_on("surname"), **args + table_or_tables=[df_1, df_2], blocking_rule_creator=block_on("surname"), **args ) res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"] assert res == 1 res_dict = count_comparisons_from_blocking_rule( - 
table_or_tables=[df_1, df_2], blocking_rule=block_on("first_name"), **args + table_or_tables=[df_1, df_2], + blocking_rule_creator=block_on("first_name"), + **args, ) res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"] assert res == 3 res_dict = count_comparisons_from_blocking_rule( - table_or_tables=[df_1, df_2, df_3], blocking_rule="1=1", **args + table_or_tables=[df_1, df_2, df_3], blocking_rule_creator="1=1", **args ) res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"] assert res == 4 * 3 + 4 * 2 + 2 * 3 args["link_type"] = "link_and_dedupe" res_dict = count_comparisons_from_blocking_rule( - table_or_tables=[df_1, df_2], blocking_rule="1=1", **args + table_or_tables=[df_1, df_2], blocking_rule_creator="1=1", **args ) res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"] expected = 4 * 3 + (4 * 3 / 2) + (3 * 2 / 2) @@ -97,14 +99,14 @@ def test_analyse_blocking_slow_methodology(test_helpers, dialect): rule = "l.first_name = r.first_name and l.surname = r.surname" res_dict = count_comparisons_from_blocking_rule( - table_or_tables=[df_1, df_2], blocking_rule=rule, **args + table_or_tables=[df_1, df_2], blocking_rule_creator=rule, **args ) res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"] assert res == 1 rule = block_on("first_name", "surname") res_dict = count_comparisons_from_blocking_rule( - table_or_tables=[df_1, df_2], blocking_rule=rule, **args + table_or_tables=[df_1, df_2], blocking_rule_creator=rule, **args ) res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"] assert res == 1 @@ -416,7 +418,7 @@ def test_analyse_blocking_fast_methodology_edge_cases(): for br in blocking_rules: res_dict = count_comparisons_from_blocking_rule( table_or_tables=df, - blocking_rule=br, + blocking_rule_creator=br, link_type="dedupe_only", db_api=db_api, unique_id_column_name="unique_id", @@ -453,7 +455,7 @@ def test_analyse_blocking_fast_methodology_edge_cases(): for br in blocking_rules: res_dict = count_comparisons_from_blocking_rule( table_or_tables=[df_l, df_r], - blocking_rule=br, + blocking_rule_creator=br, link_type="link_only", db_api=db_api, unique_id_column_name="unique_id", diff --git a/tests/test_full_example_duckdb.py b/tests/test_full_example_duckdb.py index f47394d253..4cfd7b2fe8 100644 --- a/tests/test_full_example_duckdb.py +++ b/tests/test_full_example_duckdb.py @@ -44,7 +44,7 @@ def test_full_example_duckdb(tmp_path): count_comparisons_from_blocking_rule( table_or_tables=df, - blocking_rule='l.first_name = r.first_name and l."SUR name" = r."SUR name"', + blocking_rule_creator='l.first_name = r.first_name and l."SUR name" = r."SUR name"', # noqa: E501 link_type="dedupe_only", db_api=db_api, unique_id_column_name="unique_id", diff --git a/tests/test_full_example_postgres.py b/tests/test_full_example_postgres.py index b4d67a45a9..2eb927cdf4 100644 --- a/tests/test_full_example_postgres.py +++ b/tests/test_full_example_postgres.py @@ -29,7 +29,7 @@ def test_full_example_postgres(tmp_path, pg_engine): count_comparisons_from_blocking_rule( table_or_tables=df, - blocking_rule='l.first_name = r.first_name and l."surname" = r."surname"', + blocking_rule_creator='l.first_name = r.first_name and l."surname" = r."surname"', # noqa: E501 link_type="dedupe_only", db_api=db_api, unique_id_column_name="unique_id", diff --git a/tests/test_total_comparison_count.py b/tests/test_total_comparison_count.py index 83381e5fe5..4cf637450d 100644 --- a/tests/test_total_comparison_count.py +++ 
b/tests/test_total_comparison_count.py @@ -87,7 +87,7 @@ def make_dummy_frame(row_count): res_dict = count_comparisons_from_blocking_rule( table_or_tables=dfs, - blocking_rule="1=1", + blocking_rule_creator="1=1", link_type=link_type, db_api=db_api, unique_id_column_name="unique_id", From bacf875742f708f88d04a8740d683610374a8f2b Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 14:22:01 +0100 Subject: [PATCH 34/59] fix tests --- splink/analyse_blocking.py | 2 +- splink/blocking_rule_creator.py | 4 ++-- splink/linker.py | 11 ++++------- tests/test_analyse_blocking.py | 12 +++++++----- 4 files changed, 14 insertions(+), 15 deletions(-) diff --git a/splink/analyse_blocking.py b/splink/analyse_blocking.py index 78849589ec..3a5d542c3d 100644 --- a/splink/analyse_blocking.py +++ b/splink/analyse_blocking.py @@ -314,7 +314,7 @@ def _cumulative_comparisons_to_be_scored_from_blocking_rules( if len(splink_df_dict) == 2 and link_type == "link_only": link_type = "two_dataset_link_only" - if "two_dataset_link_only" and source_dataset_column_name is not None: + if link_type == "two_dataset_link_only" and source_dataset_column_name is not None: sqls = split_df_concat_with_tf_into_two_tables_sqls( "__splink__df_concat", source_dataset_column_name, diff --git a/splink/blocking_rule_creator.py b/splink/blocking_rule_creator.py index 8abfc980d7..d9528d7da2 100644 --- a/splink/blocking_rule_creator.py +++ b/splink/blocking_rule_creator.py @@ -1,7 +1,7 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import Any, Union, final +from typing import Any, Dict, Union, final from .blocking import BlockingRule, blocking_rule_to_obj from .dialects import SplinkDialect @@ -53,4 +53,4 @@ def get_blocking_rule(self, sql_dialect_str: str) -> BlockingRule: return blocking_rule_to_obj(self.create_blocking_rule_dict(sql_dialect_str)) -acceptable_br_creator_types = Union[BlockingRuleCreator, str, dict[str, Any]] +acceptable_br_creator_types = Union[BlockingRuleCreator, str, Dict[str, Any]] diff --git a/splink/linker.py b/splink/linker.py index 3222ce1b2a..43261e9d0d 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -2827,14 +2827,11 @@ def estimate_probability_two_random_records_match( ) from None blocking_rules: List[BlockingRule] = [] for br in deterministic_matching_rules: - if isinstance(br, BlockingRule): - blocking_rules.append(br) - else: - blocking_rules.append( - to_blocking_rule_creator(br).get_blocking_rule( - self.db_api.sql_dialect.name - ) + blocking_rules.append( + to_blocking_rule_creator(br).get_blocking_rule( + self.db_api.sql_dialect.name ) + ) pd_df = _cumulative_comparisons_to_be_scored_from_blocking_rules( splink_df_dict=self._input_tables_dict, diff --git a/tests/test_analyse_blocking.py b/tests/test_analyse_blocking.py index ed84224429..8bd576815e 100644 --- a/tests/test_analyse_blocking.py +++ b/tests/test_analyse_blocking.py @@ -343,7 +343,7 @@ def test_analyse_blocking_fast_methodology(): "compute_post_filter_count": False, } - args["blocking_rule"] = "1=1" + args["blocking_rule_creator"] = "1=1" res_dict = count_comparisons_from_blocking_rule(**args) @@ -351,7 +351,9 @@ def test_analyse_blocking_fast_methodology(): assert res == 5 * 5 - args["blocking_rule"] = "l.first_name = r.first_name OR l.surname = r.surname" + args["blocking_rule_creator"] = ( + "l.first_name = r.first_name OR l.surname = r.surname" + ) res_dict = count_comparisons_from_blocking_rule(**args) res = res_dict["number_of_comparisons_generated_pre_filter_conditions"] 
assert res == 5 * 5 @@ -361,7 +363,7 @@ def test_analyse_blocking_fast_methodology(): # ) # assert res == 3 * 3 + 1 * 1 + 1 * 1 - args["blocking_rule"] = """l.first_name = r.first_name + args["blocking_rule_creator"] = """l.first_name = r.first_name AND levenshtein(l.surname, r.surname) <2""" res_dict = count_comparisons_from_blocking_rule(**args) res = res_dict["number_of_comparisons_generated_pre_filter_conditions"] @@ -369,7 +371,7 @@ def test_analyse_blocking_fast_methodology(): args["table_or_tables"] = [df_1, df_2] args["link_type"] = "link_and_dedupe" - args["blocking_rule"] = block_on("first_name") + args["blocking_rule_creator"] = block_on("first_name") res_dict = count_comparisons_from_blocking_rule(**args) res = res_dict["number_of_comparisons_generated_pre_filter_conditions"] @@ -377,7 +379,7 @@ def test_analyse_blocking_fast_methodology(): assert res == 6 * 6 + 1 * 1 + 1 * 1 args["link_type"] = "link_only" - args["blocking_rule"] = block_on("first_name") + args["blocking_rule_creator"] = block_on("first_name") res_dict = count_comparisons_from_blocking_rule(**args) res = res_dict["number_of_comparisons_generated_pre_filter_conditions"] From 16c8c2c4e61761b8866b0a19c3052c239920f414 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 15:27:22 +0100 Subject: [PATCH 35/59] fix compare two records --- splink/linker.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/splink/linker.py b/splink/linker.py index 43261e9d0d..931b9665fa 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -1052,7 +1052,7 @@ def predict( self, threshold_match_probability: float = None, threshold_match_weight: float = None, - materialise_after_computing_term_frequencies=True, + materialise_after_computing_term_frequencies: bool = True, ) -> SplinkDataFrame: """Create a dataframe of scored pairwise comparisons using the parameters of the linkage model. 
@@ -1333,12 +1333,6 @@ def compare_two_records( Returns: SplinkDataFrame: Pairwise comparison with scored prediction """ - original_blocking_rules = ( - self._settings_obj._blocking_rules_to_generate_predictions - ) - original_link_type = self._settings_obj._link_type - - self._settings_obj._blocking_rules_to_generate_predictions = [] cache = self._intermediate_table_cache @@ -1385,11 +1379,15 @@ def compare_two_records( pipeline.enqueue_sql(sql_join_tf, "__splink__compare_two_records_right_with_tf") sqls = block_using_rules_sqls( - self, input_tablename_l="__splink__compare_two_records_left_with_tf", input_tablename_r="__splink__compare_two_records_right_with_tf", blocking_rules=[BlockingRule("1=1")], link_type=self._settings_obj._link_type, + columns_to_select_sql=", ".join( + self._settings_obj._columns_to_select_for_blocking + ), + source_dataset_input_column=self._settings_obj.column_info_settings.source_dataset_input_column, + unique_id_input_column=self._settings_obj.column_info_settings.unique_id_input_column, ) pipeline.enqueue_list_of_sqls(sqls) @@ -1408,11 +1406,6 @@ def compare_two_records( pipeline, use_cache=False ) - self._settings_obj._blocking_rules_to_generate_predictions = ( - original_blocking_rules - ) - self._settings_obj._link_type = original_link_type - return predictions def _self_link(self) -> SplinkDataFrame: From a9529439f1060aca9ad5c13aa34ec9bc10123f7f Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 15:30:37 +0100 Subject: [PATCH 36/59] fix linker mypy --- splink/linker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/splink/linker.py b/splink/linker.py index 931b9665fa..e805a9e8a3 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -2797,7 +2797,7 @@ def estimate_probability_two_random_records_match( deterministic_matching_rules: List[Union[str, BlockingRuleCreator]], recall: float, max_rows_limit: int = int(1e9), - ): + ) -> None: """Estimate the model parameter `probability_two_random_records_match` using a direct estimation approach. 
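
By this point in the series, the blocking analysis functions are fully usable without
constructing a Linker: raw tables, a blocking rule creator, a link type and a database
API are passed in directly. A minimal sketch of the API as it stands here, matching the
signatures above (the toy DataFrame is illustrative only, and note that a later patch
renames the module, so on current versions the import path is splink.blocking_analysis
rather than splink.analyse_blocking):

    import pandas as pd

    from splink import DuckDBAPI, block_on
    from splink.analyse_blocking import count_comparisons_from_blocking_rule

    # Illustrative toy input: any table with a unique id column will do
    df = pd.DataFrame(
        {
            "unique_id": [1, 2, 3, 4],
            "first_name": ["tom", "tom", "amy", "amy"],
            "surname": ["fox", "fox", "lee", "bone"],
        }
    )

    counts = count_comparisons_from_blocking_rule(
        table_or_tables=df,
        blocking_rule_creator=block_on("first_name", "surname"),
        link_type="dedupe_only",
        db_api=DuckDBAPI(),
        unique_id_column_name="unique_id",
    )

    # Returns a dict with keys including
    # number_of_comparisons_generated_pre_filter_conditions and
    # number_of_comparisons_to_be_scored_post_filter_conditions
    print(counts)

The cumulative functions follow the same pattern, taking an iterable
blocking_rule_creators argument in place of a single rule.
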
From 1ec069501e5c8c209dffe10b2e69b6df43e951b6 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 15:37:57 +0100 Subject: [PATCH 37/59] all mypy except auto blocking --- splink/analyse_blocking.py | 4 ++-- splink/blocking.py | 22 +++++++++++++--------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/splink/analyse_blocking.py b/splink/analyse_blocking.py index 3a5d542c3d..2b1d49e569 100644 --- a/splink/analyse_blocking.py +++ b/splink/analyse_blocking.py @@ -479,7 +479,7 @@ def count_comparisons_from_blocking_rule( compute_post_filter_count: bool = True, max_rows_limit: int = int(1e9), ) -> dict[str, Union[int, str]]: - blocking_rule_creator = to_blocking_rule_creator( + blocking_rule_creator_as_creator = to_blocking_rule_creator( blocking_rule_creator ).get_blocking_rule(db_api.sql_dialect.name) @@ -487,7 +487,7 @@ def count_comparisons_from_blocking_rule( return _count_comparisons_generated_from_blocking_rule( splink_df_dict=splink_df_dict, - blocking_rule=blocking_rule_creator, + blocking_rule=blocking_rule_creator_as_creator, link_type=link_type, db_api=db_api, compute_post_filter_count=compute_post_filter_count, diff --git a/splink/blocking.py b/splink/blocking.py index bf7b29ba36..4dfbb359cf 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -66,7 +66,8 @@ def blocking_rule_to_obj(br: BlockingRule | dict[str, Any] | str) -> BlockingRul def combine_unique_id_input_columns( - source_dataset_input_column: InputColumn, unique_id_input_column: InputColumn + source_dataset_input_column: Optional[InputColumn], + unique_id_input_column: InputColumn, ) -> List[InputColumn]: unique_id_input_columns: List[InputColumn] = [] if source_dataset_input_column: @@ -107,7 +108,7 @@ def add_preceding_rules(self, rules): def exclude_pairs_generated_by_this_rule_sql( self, - source_dataset_input_column: InputColumn, + source_dataset_input_column: Optional[InputColumn], unique_id_input_column: InputColumn, ) -> str: """A SQL string specifying how to exclude the results @@ -122,7 +123,7 @@ def exclude_pairs_generated_by_this_rule_sql( def exclude_pairs_generated_by_all_preceding_rules_sql( self, - source_dataset_input_column: InputColumn, + source_dataset_input_column: Optional[InputColumn], unique_id_input_column: InputColumn, ) -> str: """A SQL string that excludes the results of ALL previous blocking rules from @@ -143,7 +144,7 @@ def exclude_pairs_generated_by_all_preceding_rules_sql( def create_blocked_pairs_sql( self, *, - source_dataset_input_column: InputColumn, + source_dataset_input_column: Optional[InputColumn], unique_id_input_column: InputColumn, input_tablename_l: str, input_tablename_r: str, @@ -282,7 +283,7 @@ def _salting_condition(self, salt): def create_blocked_pairs_sql( self, *, - source_dataset_input_column: InputColumn, + source_dataset_input_column: Optional[InputColumn], unique_id_input_column: InputColumn, input_tablename_l: str, input_tablename_r: str, @@ -338,7 +339,7 @@ def __init__( def marginal_exploded_id_pairs_table_sql( self, - source_dataset_input_column: InputColumn, + source_dataset_input_column: Optional[InputColumn], unique_id_input_column: InputColumn, br: BlockingRule, link_type: "LinkTypeLiteralType", @@ -386,7 +387,7 @@ def drop_materialised_id_pairs_dataframe(self): def exclude_pairs_generated_by_this_rule_sql( self, - source_dataset_input_column: InputColumn, + source_dataset_input_column: Optional[InputColumn], unique_id_input_column: InputColumn, ) -> str: """A SQL string specifying how to exclude the results @@ -423,7 
+424,7 @@ def exclude_pairs_generated_by_this_rule_sql( def create_blocked_pairs_sql( self, *, - source_dataset_input_column: InputColumn, + source_dataset_input_column: Optional[InputColumn], unique_id_input_column: InputColumn, input_tablename_l: str, input_tablename_r: str, @@ -519,7 +520,10 @@ def materialise_exploded_id_tables( table_name = f"{base_name}_mk_{br.match_key}" sql = br.marginal_exploded_id_pairs_table_sql( - source_dataset_input_column, unique_id_input_column, br, link_type + source_dataset_input_column=source_dataset_input_column, + unique_id_input_column=unique_id_input_column, + br=br, + link_type=link_type, ) pipeline.enqueue_sql(sql, table_name) From efa643570f9e1ad98d9c553a590a036bda0b3df8 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 15:54:10 +0100 Subject: [PATCH 38/59] final mypy errors --- ..._with_comparison_counts_below_threshold.py | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/splink/find_brs_with_comparison_counts_below_threshold.py b/splink/find_brs_with_comparison_counts_below_threshold.py index 6ac7043143..256cd0e190 100644 --- a/splink/find_brs_with_comparison_counts_below_threshold.py +++ b/splink/find_brs_with_comparison_counts_below_threshold.py @@ -7,11 +7,12 @@ import pandas as pd from .analyse_blocking import ( - count_comparisons_from_blocking_rule, + _count_comparisons_generated_from_blocking_rule, ) from .blocking import BlockingRule from .blocking_rule_creator import BlockingRuleCreator from .blocking_rule_library import CustomRule, block_on +from .database_api import DatabaseAPISubClass from .input_column import InputColumn if TYPE_CHECKING: @@ -69,23 +70,23 @@ def _generate_combinations( def _generate_blocking_rule( - linker: "Linker", cols_as_string: List[str] + db_api: DatabaseAPISubClass, cols_as_string: List[str] ) -> BlockingRule: """Generate a Splink blocking rule given a list of column names which are provided as as string""" if len(cols_as_string) == 0: - br: BlockingRuleCreator = CustomRule("1=1", linker._sql_dialect) + br: BlockingRuleCreator = CustomRule("1=1", db_api.sql_dialect.name) else: br = block_on(*cols_as_string) - return br.get_blocking_rule(linker._sql_dialect) + return br.get_blocking_rule(db_api.sql_dialect.name) def _search_tree_for_blocking_rules_below_threshold_count( linker: "Linker", all_columns: List[str], - threshold: float, + threshold: int, current_combination: List[str] = None, already_visited: Set[frozenset[str]] = None, results: List[Dict[str, str]] = None, @@ -156,12 +157,20 @@ def _search_tree_for_blocking_rules_below_threshold_count( if len(current_combination) == len(all_columns): return results # All fields included, meaning we're at a leaf so exit recursion - br = _generate_blocking_rule(linker, current_combination) + br = _generate_blocking_rule(linker.db_api, current_combination) - comparison_count = count_comparisons_from_blocking_rule(linker, br) + comparison_count = _count_comparisons_generated_from_blocking_rule( + splink_df_dict=linker._input_tables_dict, + blocking_rule=br, + link_type=linker._settings_obj._link_type, + db_api=linker.db_api, + compute_post_filter_count=False, + )["number_of_comparisons_generated_pre_filter_conditions"] already_visited.add(frozenset(current_combination)) + # int just to satisfy mypy + comparison_count = int(comparison_count) if comparison_count > threshold: # Generate all valid combinations and continue the search combinations = _generate_combinations( From 8a1ff8883ef9494945a4c2cf50e2da4b10aaa267 
Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 16:04:28 +0100 Subject: [PATCH 39/59] rename module to blocking analysis --- splink/{analyse_blocking.py => blocking_analysis.py} | 0 splink/find_brs_with_comparison_counts_below_threshold.py | 4 ++-- splink/linker.py | 2 +- tests/test_analyse_blocking.py | 4 ++-- tests/test_full_example_deterministic_link.py | 2 +- tests/test_full_example_duckdb.py | 2 +- tests/test_full_example_postgres.py | 2 +- tests/test_new_db_api.py | 2 +- tests/test_total_comparison_count.py | 2 +- 9 files changed, 10 insertions(+), 10 deletions(-) rename splink/{analyse_blocking.py => blocking_analysis.py} (100%) diff --git a/splink/analyse_blocking.py b/splink/blocking_analysis.py similarity index 100% rename from splink/analyse_blocking.py rename to splink/blocking_analysis.py diff --git a/splink/find_brs_with_comparison_counts_below_threshold.py b/splink/find_brs_with_comparison_counts_below_threshold.py index 256cd0e190..f6c2fe51b6 100644 --- a/splink/find_brs_with_comparison_counts_below_threshold.py +++ b/splink/find_brs_with_comparison_counts_below_threshold.py @@ -6,10 +6,10 @@ import pandas as pd -from .analyse_blocking import ( +from .blocking import BlockingRule +from .blocking_analysis import ( _count_comparisons_generated_from_blocking_rule, ) -from .blocking import BlockingRule from .blocking_rule_creator import BlockingRuleCreator from .blocking_rule_library import CustomRule, block_on from .database_api import DatabaseAPISubClass diff --git a/splink/linker.py b/splink/linker.py index e805a9e8a3..6ed76c4c63 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -14,7 +14,6 @@ truth_space_table_from_labels_column, truth_space_table_from_labels_table, ) -from .analyse_blocking import _cumulative_comparisons_to_be_scored_from_blocking_rules from .blocking import ( BlockingRule, SaltedBlockingRule, @@ -22,6 +21,7 @@ blocking_rule_to_obj, materialise_exploded_id_tables, ) +from .blocking_analysis import _cumulative_comparisons_to_be_scored_from_blocking_rules from .blocking_rule_creator import BlockingRuleCreator from .blocking_rule_creator_utils import to_blocking_rule_creator from .cache_dict_with_logging import CacheDictWithLogging diff --git a/tests/test_analyse_blocking.py b/tests/test_analyse_blocking.py index 8bd576815e..2cb355e8e1 100644 --- a/tests/test_analyse_blocking.py +++ b/tests/test_analyse_blocking.py @@ -1,12 +1,12 @@ import duckdb import pandas as pd -from splink.analyse_blocking import ( +from splink.blocking import BlockingRule +from splink.blocking_analysis import ( count_comparisons_from_blocking_rule, cumulative_comparisons_to_be_scored_from_blocking_rules_chart, cumulative_comparisons_to_be_scored_from_blocking_rules_data, ) -from splink.blocking import BlockingRule from splink.blocking_rule_library import CustomRule, Or, block_on from splink.duckdb.database_api import DuckDBAPI diff --git a/tests/test_full_example_deterministic_link.py b/tests/test_full_example_deterministic_link.py index d72ea935ad..b817a7b559 100644 --- a/tests/test_full_example_deterministic_link.py +++ b/tests/test_full_example_deterministic_link.py @@ -2,7 +2,7 @@ import pandas as pd -from splink.analyse_blocking import ( +from splink.blocking_analysis import ( cumulative_comparisons_to_be_scored_from_blocking_rules_chart, ) from splink.linker import Linker diff --git a/tests/test_full_example_duckdb.py b/tests/test_full_example_duckdb.py index 4cfd7b2fe8..d40e7e5f45 100644 --- a/tests/test_full_example_duckdb.py +++ 
b/tests/test_full_example_duckdb.py @@ -8,7 +8,7 @@ import splink.comparison_level_library as cll import splink.comparison_library as cl -from splink.analyse_blocking import count_comparisons_from_blocking_rule +from splink.blocking_analysis import count_comparisons_from_blocking_rule from splink.duckdb.database_api import DuckDBAPI from splink.exploratory import completeness_chart, profile_columns from splink.linker import Linker diff --git a/tests/test_full_example_postgres.py b/tests/test_full_example_postgres.py index 2eb927cdf4..86f7cfc75a 100644 --- a/tests/test_full_example_postgres.py +++ b/tests/test_full_example_postgres.py @@ -2,7 +2,7 @@ import pandas as pd -from splink.analyse_blocking import ( +from splink.blocking_analysis import ( count_comparisons_from_blocking_rule, cumulative_comparisons_to_be_scored_from_blocking_rules_chart, ) diff --git a/tests/test_new_db_api.py b/tests/test_new_db_api.py index 034ac018b2..09e0e022b1 100644 --- a/tests/test_new_db_api.py +++ b/tests/test_new_db_api.py @@ -3,7 +3,7 @@ import splink.comparison_level_library as cll import splink.comparison_library as cl from splink import block_on -from splink.analyse_blocking import ( +from splink.blocking_analysis import ( cumulative_comparisons_to_be_scored_from_blocking_rules_chart, ) from splink.exploratory import profile_columns diff --git a/tests/test_total_comparison_count.py b/tests/test_total_comparison_count.py index 4cf637450d..9513e5edcb 100644 --- a/tests/test_total_comparison_count.py +++ b/tests/test_total_comparison_count.py @@ -1,7 +1,7 @@ import pandas as pd import pytest -from splink.analyse_blocking import count_comparisons_from_blocking_rule +from splink.blocking_analysis import count_comparisons_from_blocking_rule from splink.duckdb.database_api import DuckDBAPI from splink.misc import calculate_cartesian from splink.pipeline import CTEPipeline From 2a896e688fe5d36dc2a21421b0bb4b25e1b07d73 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 16:20:37 +0100 Subject: [PATCH 40/59] move files --- ..._with_comparison_counts_below_threshold.py | 6 ++-- splink/{ => internals}/blocking_analysis.py | 32 +++++++++---------- splink/linker.py | 4 ++- 3 files changed, 22 insertions(+), 20 deletions(-) rename splink/{ => internals}/blocking_analysis.py (95%) diff --git a/splink/find_brs_with_comparison_counts_below_threshold.py b/splink/find_brs_with_comparison_counts_below_threshold.py index f6c2fe51b6..f1ad8c1751 100644 --- a/splink/find_brs_with_comparison_counts_below_threshold.py +++ b/splink/find_brs_with_comparison_counts_below_threshold.py @@ -7,13 +7,13 @@ import pandas as pd from .blocking import BlockingRule -from .blocking_analysis import ( - _count_comparisons_generated_from_blocking_rule, -) from .blocking_rule_creator import BlockingRuleCreator from .blocking_rule_library import CustomRule, block_on from .database_api import DatabaseAPISubClass from .input_column import InputColumn +from .internals.blocking_analysis import ( + _count_comparisons_generated_from_blocking_rule, +) if TYPE_CHECKING: from .linker import Linker diff --git a/splink/blocking_analysis.py b/splink/internals/blocking_analysis.py similarity index 95% rename from splink/blocking_analysis.py rename to splink/internals/blocking_analysis.py index 2b1d49e569..025ccafe6c 100644 --- a/splink/blocking_analysis.py +++ b/splink/internals/blocking_analysis.py @@ -1,12 +1,12 @@ from __future__ import annotations import logging -from typing import Iterable, List, Optional, Sequence, Union +from typing 
import Any, Dict, Iterable, List, Optional, Sequence, Union import pandas as pd import sqlglot -from .blocking import ( +from ..blocking import ( BlockingRule, _sql_gen_where_condition, backend_link_type_options, @@ -14,15 +14,15 @@ materialise_exploded_id_tables, user_input_link_type_options, ) -from .blocking_rule_creator import acceptable_br_creator_types -from .blocking_rule_creator_utils import to_blocking_rule_creator -from .charts import ChartReturnType, cumulative_blocking_rule_comparisons_generated -from .database_api import AcceptableInputTableType, DatabaseAPISubClass -from .input_column import InputColumn -from .misc import calculate_cartesian -from .pipeline import CTEPipeline -from .splink_dataframe import SplinkDataFrame -from .vertically_concatenate import ( +from ..blocking_rule_creator import BlockingRuleCreator +from ..blocking_rule_creator_utils import to_blocking_rule_creator +from ..charts import ChartReturnType, cumulative_blocking_rule_comparisons_generated +from ..database_api import AcceptableInputTableType, DatabaseAPISubClass +from ..input_column import InputColumn +from ..misc import calculate_cartesian +from ..pipeline import CTEPipeline +from ..splink_dataframe import SplinkDataFrame +from ..vertically_concatenate import ( split_df_concat_with_tf_into_two_tables_sqls, vertically_concatenate_sql, ) @@ -472,7 +472,7 @@ def add_l_r(sql, table_name): def count_comparisons_from_blocking_rule( *, table_or_tables: Sequence[AcceptableInputTableType], - blocking_rule_creator: acceptable_br_creator_types, + blocking_rule_creator: Union[BlockingRuleCreator, str, Dict[str, Any]], link_type: user_input_link_type_options, db_api: DatabaseAPISubClass, unique_id_column_name: str, @@ -499,12 +499,12 @@ def count_comparisons_from_blocking_rule( def cumulative_comparisons_to_be_scored_from_blocking_rules_data( *, table_or_tables: Sequence[AcceptableInputTableType], - blocking_rule_creators: Iterable[acceptable_br_creator_types], + blocking_rule_creators: Iterable[Union[BlockingRuleCreator, str, Dict[str, Any]]], link_type: user_input_link_type_options, db_api: DatabaseAPISubClass, unique_id_column_name: str, max_rows_limit: int = int(1e9), - source_dataset_column_name: str = "source_dataset", + source_dataset_column_name: Optional[str] = None, ) -> pd.DataFrame: splink_df_dict = db_api.register_multiple_tables(table_or_tables) @@ -531,12 +531,12 @@ def cumulative_comparisons_to_be_scored_from_blocking_rules_data( def cumulative_comparisons_to_be_scored_from_blocking_rules_chart( *, table_or_tables: Sequence[AcceptableInputTableType], - blocking_rule_creators: Iterable[acceptable_br_creator_types], + blocking_rule_creators: Iterable[Union[BlockingRuleCreator, str, Dict[str, Any]]], link_type: user_input_link_type_options, db_api: DatabaseAPISubClass, unique_id_column_name: str, max_rows_limit: int = int(1e9), - source_dataset_column_name: str = None, + source_dataset_column_name: Optional[str] = None, ) -> ChartReturnType: splink_df_dict = db_api.register_multiple_tables(table_or_tables) diff --git a/splink/linker.py b/splink/linker.py index 6ed76c4c63..0c589cb26f 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -21,7 +21,6 @@ blocking_rule_to_obj, materialise_exploded_id_tables, ) -from .blocking_analysis import _cumulative_comparisons_to_be_scored_from_blocking_rules from .blocking_rule_creator import BlockingRuleCreator from .blocking_rule_creator_utils import to_blocking_rule_creator from .cache_dict_with_logging import CacheDictWithLogging @@ -63,6 +62,9 @@ 
_size_density_centralisation_sql, ) from .input_column import InputColumn +from .internals.blocking_analysis import ( + _cumulative_comparisons_to_be_scored_from_blocking_rules, +) from .labelling_tool import ( generate_labelling_tool_comparisons, render_labelling_tool_html, From afe1d9c58c906717f220bc398ab51bfad320b03f Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 16:23:45 +0100 Subject: [PATCH 41/59] aliases for public api --- splink/blocking_analysis.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 splink/blocking_analysis.py diff --git a/splink/blocking_analysis.py b/splink/blocking_analysis.py new file mode 100644 index 0000000000..62a5f93448 --- /dev/null +++ b/splink/blocking_analysis.py @@ -0,0 +1,11 @@ +from .internals.blocking_analysis import ( + count_comparisons_from_blocking_rule, + cumulative_comparisons_to_be_scored_from_blocking_rules_chart, + cumulative_comparisons_to_be_scored_from_blocking_rules_data, +) + +__all__ = [ + "count_comparisons_from_blocking_rule", + "cumulative_comparisons_to_be_scored_from_blocking_rules_chart", + "cumulative_comparisons_to_be_scored_from_blocking_rules_data", +] From b288849c032244da0970c5c457632ee88e8c6722 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 16:33:23 +0100 Subject: [PATCH 42/59] update blocking notebook --- docs/demos/tutorials/03_Blocking.ipynb | 136 ++++++++++++++----------- 1 file changed, 76 insertions(+), 60 deletions(-) diff --git a/docs/demos/tutorials/03_Blocking.ipynb b/docs/demos/tutorials/03_Blocking.ipynb index 068db68b6c..6f9fee7bb2 100644 --- a/docs/demos/tutorials/03_Blocking.ipynb +++ b/docs/demos/tutorials/03_Blocking.ipynb @@ -124,10 +124,10 @@ "execution_count": 1, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:09:02.338892Z", - "iopub.status.busy": "2024-03-27T15:09:02.338599Z", - "iopub.status.idle": "2024-03-27T15:09:02.343761Z", - "shell.execute_reply": "2024-03-27T15:09:02.343144Z" + "iopub.execute_input": "2024-05-15T15:33:06.396495Z", + "iopub.status.busy": "2024-05-15T15:33:06.396182Z", + "iopub.status.idle": "2024-05-15T15:33:06.401561Z", + "shell.execute_reply": "2024-05-15T15:33:06.400904Z" } }, "outputs": [], @@ -141,10 +141,10 @@ "execution_count": 2, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:09:02.347806Z", - "iopub.status.busy": "2024-03-27T15:09:02.347494Z", - "iopub.status.idle": "2024-03-27T15:09:04.113194Z", - "shell.execute_reply": "2024-03-27T15:09:04.112488Z" + "iopub.execute_input": "2024-05-15T15:33:06.405358Z", + "iopub.status.busy": "2024-05-15T15:33:06.405054Z", + "iopub.status.idle": "2024-05-15T15:33:08.329315Z", + "shell.execute_reply": "2024-05-15T15:33:08.328603Z" }, "tags": [] }, @@ -172,10 +172,10 @@ "execution_count": 3, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:09:04.117601Z", - "iopub.status.busy": "2024-03-27T15:09:04.117271Z", - "iopub.status.idle": "2024-03-27T15:09:04.364556Z", - "shell.execute_reply": "2024-03-27T15:09:04.363921Z" + "iopub.execute_input": "2024-05-15T15:33:08.333777Z", + "iopub.status.busy": "2024-05-15T15:33:08.333405Z", + "iopub.status.idle": "2024-05-15T15:33:08.644698Z", + "shell.execute_reply": "2024-05-15T15:33:08.643999Z" }, "tags": [] }, @@ -184,35 +184,42 @@ "name": "stdout", "output_type": "stream", "text": [ - "473 comparisons generated by blocking_rule_1\n", - "1,638 comparisons generated by blocking_rule_2\n", - "682 comparisons generated by blocking_rule_3\n", - "315 comparisons generated by 
blocking_rule_4\n" + "---\n", + "{'number_of_comparisons_generated_pre_filter_conditions': 1632, 'number_of_comparisons_to_be_scored_post_filter_conditions': 473, 'filter_conditions_identified': '', 'equi_join_conditions_identified': 'SUBSTR(l.first_name, 1, 1) = SUBSTR(r.first_name, 1, 1) AND l.\"surname\" = r.\"surname\"'}\n", + "---\n", + "{'number_of_comparisons_generated_pre_filter_conditions': 4095, 'number_of_comparisons_to_be_scored_post_filter_conditions': 1638, 'filter_conditions_identified': '', 'equi_join_conditions_identified': 'l.\"surname\" = r.\"surname\"'}\n", + "---\n", + "{'number_of_comparisons_generated_pre_filter_conditions': 2153, 'number_of_comparisons_to_be_scored_post_filter_conditions': 682, 'filter_conditions_identified': '', 'equi_join_conditions_identified': 'l.\"email\" = r.\"email\"'}\n", + "---\n", + "{'number_of_comparisons_generated_pre_filter_conditions': 1304, 'number_of_comparisons_to_be_scored_post_filter_conditions': 315, 'filter_conditions_identified': '', 'equi_join_conditions_identified': 'l.\"city\" = r.\"city\" AND l.\"first_name\" = r.\"first_name\"'}\n", + "---\n", + "{'number_of_comparisons_generated_pre_filter_conditions': 4827, 'number_of_comparisons_to_be_scored_post_filter_conditions': 372, 'filter_conditions_identified': 'LEVENSHTEIN(l.surname, r.surname) < 2', 'equi_join_conditions_identified': 'l.first_name = r.first_name'}\n" ] } ], "source": [ - "db_api = DuckDBAPI()\n", - "\n", - "# TODO: Want to be able to do this without creating a linker\n", - "settings = SettingsCreator(link_type=\"dedupe_only\")\n", - "linker = Linker(df, settings, database_api=db_api)\n", - "\n", - "blocking_rule_1 = block_on(\"substr(first_name, 1,1)\", \"surname\")\n", - "count = linker.count_num_comparisons_from_blocking_rule(blocking_rule_1)\n", - "print(f\"{count:,.0f} comparisons generated by blocking_rule_1\")\n", + "from splink.blocking_analysis import count_comparisons_from_blocking_rule\n", "\n", - "blocking_rule_2 = block_on(\"surname\")\n", - "count = linker.count_num_comparisons_from_blocking_rule(blocking_rule_2)\n", - "print(f\"{count:,.0f} comparisons generated by blocking_rule_2\")\n", - "\n", - "blocking_rule_3 = block_on(\"email\")\n", - "count = linker.count_num_comparisons_from_blocking_rule(blocking_rule_3)\n", - "print(f\"{count:,.0f} comparisons generated by blocking_rule_3\")\n", - "\n", - "blocking_rule_4 = block_on(\"city\", \"first_name\")\n", - "count = linker.count_num_comparisons_from_blocking_rule(blocking_rule_4)\n", - "print(f\"{count:,.0f} comparisons generated by blocking_rule_4\")" + "db_api = DuckDBAPI()\n", + "blocking_rules_for_analysis = [\n", + " block_on(\"substr(first_name, 1,1)\", \"surname\"),\n", + " block_on(\"surname\"),\n", + " block_on(\"email\"),\n", + " block_on(\"city\", \"first_name\"),\n", + " \"l.first_name = r.first_name and levenshtein(l.surname, r.surname) < 2\",\n", + "]\n", + "\n", + "\n", + "for br in blocking_rules_for_analysis:\n", + " counts = count_comparisons_from_blocking_rule(\n", + " table_or_tables=df,\n", + " blocking_rule_creator=br,\n", + " link_type=\"dedupe_only\",\n", + " db_api=db_api,\n", + " unique_id_column_name=\"unique_id\",\n", + " )\n", + " print(\"---\")\n", + " print(counts)" ] }, { @@ -242,10 +249,10 @@ "execution_count": 4, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:09:04.367989Z", - "iopub.status.busy": "2024-03-27T15:09:04.367740Z", - "iopub.status.idle": "2024-03-27T15:09:04.488103Z", - "shell.execute_reply": "2024-03-27T15:09:04.487447Z" + 
"iopub.execute_input": "2024-05-15T15:33:08.687383Z", + "iopub.status.busy": "2024-05-15T15:33:08.687063Z", + "iopub.status.idle": "2024-05-15T15:33:08.916238Z", + "shell.execute_reply": "2024-05-15T15:33:08.915508Z" }, "tags": [] }, @@ -255,23 +262,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -330,8 +337,17 @@ } ], "source": [ - "blocking_rules = [blocking_rule_1, blocking_rule_2, blocking_rule_3]\n", - "linker.cumulative_num_comparisons_from_blocking_rules_chart(blocking_rules)" + "from splink.blocking_analysis import (\n", + " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n", + ")\n", + "\n", + "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", + " table_or_tables=df,\n", + " blocking_rule_creators=blocking_rules_for_analysis,\n", + " db_api=db_api,\n", + " link_type=\"dedupe_only\",\n", + " unique_id_column_name=\"unique_id\",\n", + ")" ] }, { @@ -358,10 +374,10 @@ "execution_count": 5, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:09:04.491866Z", - "iopub.status.busy": "2024-03-27T15:09:04.491599Z", - "iopub.status.idle": "2024-03-27T15:09:04.698712Z", - "shell.execute_reply": "2024-03-27T15:09:04.697690Z" + "iopub.execute_input": "2024-05-15T15:33:08.919869Z", + "iopub.status.busy": "2024-05-15T15:33:08.919593Z", + "iopub.status.idle": "2024-05-15T15:33:09.103241Z", + "shell.execute_reply": "2024-05-15T15:33:09.102685Z" }, "tags": [] }, @@ -371,23 +387,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ From 5eb1f5d7f73e2a77157c8f0cf0e0ee17efb6829a Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 16:39:29 +0100 Subject: [PATCH 43/59] tests pass again --- tests/test_analyse_blocking.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_analyse_blocking.py b/tests/test_analyse_blocking.py index 2cb355e8e1..b6b760063c 100644 --- a/tests/test_analyse_blocking.py +++ b/tests/test_analyse_blocking.py @@ -197,6 +197,8 @@ def test_blocking_records_accuracy(test_helpers, dialect): ) # link and dedupe + link only + comparison_count_args["source_dataset_column_name"] = "source_dataset" + df_l = [ {"unique_id": 1, "first_name": "Tom", "surname": "Fox", "dob": "1980-01-01"}, {"unique_id": 2, "first_name": "Amy", "surname": "Lee", "dob": "1980-01-01"}, @@ -225,6 +227,7 @@ def test_blocking_records_accuracy(test_helpers, dialect): "db_api": db_api, "unique_id_column_name": "unique_id", "blocking_rule_creators": blocking_rules, + "source_dataset_column_name": "source_dataset", } validate_blocking_output( @@ -287,6 +290,7 @@ def test_blocking_records_accuracy(test_helpers, dialect): block_on("surname"), block_on("first_name"), ], + "source_dataset_column_name": "source_dataset", } validate_blocking_output( From 75bbe66791cc6aa656682f136cf97e754c19f612 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 16:45:27 +0100 Subject: [PATCH 44/59] deterministic dedupe example --- .../duckdb/deterministic_dedupe.ipynb | 383 +++++++++--------- 1 file changed, 197 insertions(+), 186 deletions(-) diff --git a/docs/demos/examples/duckdb/deterministic_dedupe.ipynb b/docs/demos/examples/duckdb/deterministic_dedupe.ipynb index 315a349141..e47bbfec51 100644 --- a/docs/demos/examples/duckdb/deterministic_dedupe.ipynb +++ b/docs/demos/examples/duckdb/deterministic_dedupe.ipynb @@ -28,10 +28,10 @@ "execution_count": 1, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:09.320146Z", - "iopub.status.busy": "2024-03-27T15:12:09.319803Z", - "iopub.status.idle": "2024-03-27T15:12:09.339855Z", - "shell.execute_reply": "2024-03-27T15:12:09.338824Z" + "iopub.execute_input": "2024-05-15T15:43:50.508953Z", + "iopub.status.busy": "2024-05-15T15:43:50.508620Z", + "iopub.status.idle": "2024-05-15T15:43:50.514416Z", + "shell.execute_reply": "2024-05-15T15:43:50.513604Z" } }, "outputs": [], @@ -45,10 +45,10 @@ "execution_count": 2, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:09.344009Z", - "iopub.status.busy": "2024-03-27T15:12:09.343689Z", - "iopub.status.idle": "2024-03-27T15:12:11.300032Z", - "shell.execute_reply": "2024-03-27T15:12:11.299171Z" + "iopub.execute_input": "2024-05-15T15:43:50.519532Z", + "iopub.status.busy": "2024-05-15T15:43:50.519159Z", + "iopub.status.idle": "2024-05-15T15:43:53.171104Z", + "shell.execute_reply": "2024-05-15T15:43:53.170070Z" } }, "outputs": [ @@ -191,6 +191,7 @@ ], "source": [ "import pandas as pd\n", + "\n", "from splink import splink_datasets\n", "\n", "pd.options.display.max_rows = 1000\n", @@ -208,34 +209,6 @@ "For a deterministic linkage, the linkage methodology is based solely on these rules, so there is no need to define `comparisons` nor any other parameters required for model training in a probabilistic model.\n" ] }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:12:11.304361Z", - "iopub.status.busy": "2024-03-27T15:12:11.303971Z", - "iopub.status.idle": "2024-03-27T15:12:11.689500Z", - 
"shell.execute_reply": "2024-03-27T15:12:11.688634Z" - } - }, - "outputs": [], - "source": [ - "from splink import SettingsCreator, Linker, block_on, DuckDBAPI\n", - "\n", - "settings = SettingsCreator(\n", - " link_type=\"dedupe_only\",\n", - " blocking_rules_to_generate_predictions=[\n", - " block_on(\"first_name\", \"surname\", \"dob\"),\n", - " block_on(\"surname\", \"dob\", \"postcode_fake\"),\n", - " block_on(\"first_name\", \"dob\", \"occupation\"),\n", - " ],\n", - " retain_intermediate_calculation_columns=True,\n", - ")\n", - "\n", - "linker = Linker(df, settings, database_api=DuckDBAPI())\n" - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -246,13 +219,13 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:11.693856Z", - "iopub.status.busy": "2024-03-27T15:12:11.693452Z", - "iopub.status.idle": "2024-03-27T15:12:12.001948Z", - "shell.execute_reply": "2024-03-27T15:12:12.001186Z" + "iopub.execute_input": "2024-05-15T15:43:53.273619Z", + "iopub.status.busy": "2024-05-15T15:43:53.271060Z", + "iopub.status.idle": "2024-05-15T15:43:54.139302Z", + "shell.execute_reply": "2024-05-15T15:43:54.138451Z" } }, "outputs": [ @@ -261,23 +234,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.Chart(...)" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "linker.cumulative_num_comparisons_from_blocking_rules_chart()" + "from splink import DuckDBAPI, block_on\n", + "from splink.blocking_analysis import (\n", + " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n", + ")\n", + "\n", + "db_api = DuckDBAPI()\n", + "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", + " table_or_tables=df,\n", + " blocking_rule_creators=[\n", + " block_on(\"first_name\", \"surname\", \"dob\"),\n", + " block_on(\"surname\", \"dob\", \"postcode_fake\"),\n", + " block_on(\"first_name\", \"dob\", \"occupation\"),\n", + " ],\n", + " db_api=db_api,\n", + " link_type=\"dedupe_only\",\n", + " unique_id_column_name=\"unique_id\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:43:54.144031Z", + "iopub.status.busy": "2024-05-15T15:43:54.143555Z", + "iopub.status.idle": "2024-05-15T15:43:54.254120Z", + "shell.execute_reply": "2024-05-15T15:43:54.252360Z" + } + }, + "outputs": [], + "source": [ + "from splink import Linker, SettingsCreator\n", + "\n", + "settings = SettingsCreator(\n", + " link_type=\"dedupe_only\",\n", + " blocking_rules_to_generate_predictions=[\n", + " block_on(\"first_name\", \"surname\", \"dob\"),\n", + " block_on(\"surname\", \"dob\", \"postcode_fake\"),\n", + " block_on(\"first_name\", \"dob\", \"occupation\"),\n", + " ],\n", + " retain_intermediate_calculation_columns=True,\n", + ")\n", + "\n", + "linker = Linker(df, settings, database_api=db_api)\n" ] }, { @@ -352,10 +369,10 @@ "execution_count": 5, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:12.006736Z", - "iopub.status.busy": "2024-03-27T15:12:12.006382Z", - "iopub.status.idle": "2024-03-27T15:12:12.499798Z", - "shell.execute_reply": "2024-03-27T15:12:12.499040Z" + "iopub.execute_input": "2024-05-15T15:43:54.259538Z", + "iopub.status.busy": "2024-05-15T15:43:54.258905Z", + "iopub.status.idle": "2024-05-15T15:43:54.922593Z", + "shell.execute_reply": "2024-05-15T15:43:54.921796Z" } }, "outputs": [ @@ -384,16 +401,15 @@ " unique_id_r\n", " dob_l\n", " dob_r\n", + " occupation_l\n", + " occupation_r\n", + " first_name_l\n", + " first_name_r\n", " surname_l\n", " surname_r\n", " postcode_fake_l\n", " postcode_fake_r\n", - " first_name_l\n", - " first_name_r\n", - " occupation_l\n", - " occupation_r\n", " match_key\n", - " match_probability\n", " \n", " \n", " \n", @@ -403,16 +419,15 @@ " Q55455287-2\n", " 1836-01-01\n", " 1836-01-01\n", + " None\n", + " writer\n", + " jaido\n", + " jaido\n", " morata\n", " morata\n", " ta4 2ug\n", " ta4 2uu\n", - " jaido\n", - " jaido\n", - " None\n", - " writer\n", " 0\n", - " 1.0\n", " \n", " \n", " 1\n", @@ -420,16 +435,15 @@ " Q55455287-3\n", " 1836-01-01\n", " 1836-01-01\n", + " None\n", + " writer\n", + " jaido\n", + " jaido\n", " morata\n", " morata\n", " ta4 2ug\n", " ta4 2uu\n", - " jaido\n", - " jaido\n", - " None\n", - " writer\n", " 0\n", - " 1.0\n", " \n", " \n", " 2\n", @@ -437,16 +451,15 @@ " Q55455287-4\n", " 1836-01-01\n", " 1836-01-01\n", + " None\n", + " writer\n", + " jaido\n", + " jaido\n", " morata\n", " morata\n", " ta4 2ug\n", " ta4 2sz\n", - " jaido\n", - " jaido\n", - " None\n", - " writer\n", " 0\n", - " 1.0\n", " \n", " \n", " 3\n", @@ -454,16 +467,15 @@ " Q55455287-5\n", " 1836-01-01\n", " 1836-01-01\n", + " None\n", + " 
None\n", + " jaido\n", + " jaido\n", " morata\n", " morata\n", " ta4 2ug\n", " ta4 2ug\n", - " jaido\n", - " jaido\n", - " None\n", - " None\n", " 0\n", - " 1.0\n", " \n", " \n", " 4\n", @@ -471,42 +483,41 @@ " Q55455287-6\n", " 1836-01-01\n", " 1836-01-01\n", - " morata\n", - " morata\n", - " ta4 2ug\n", " None\n", + " writer\n", " jaido\n", " jaido\n", + " morata\n", + " morata\n", + " ta4 2ug\n", " None\n", - " writer\n", " 0\n", - " 1.0\n", " \n", " \n", "\n", "" ], "text/plain": [ - " unique_id_l unique_id_r dob_l dob_r surname_l surname_r \\\n", - "0 Q55455287-12 Q55455287-2 1836-01-01 1836-01-01 morata morata \n", - "1 Q55455287-12 Q55455287-3 1836-01-01 1836-01-01 morata morata \n", - "2 Q55455287-12 Q55455287-4 1836-01-01 1836-01-01 morata morata \n", - "3 Q55455287-12 Q55455287-5 1836-01-01 1836-01-01 morata morata \n", - "4 Q55455287-12 Q55455287-6 1836-01-01 1836-01-01 morata morata \n", + " unique_id_l unique_id_r dob_l dob_r occupation_l \\\n", + "0 Q55455287-12 Q55455287-2 1836-01-01 1836-01-01 None \n", + "1 Q55455287-12 Q55455287-3 1836-01-01 1836-01-01 None \n", + "2 Q55455287-12 Q55455287-4 1836-01-01 1836-01-01 None \n", + "3 Q55455287-12 Q55455287-5 1836-01-01 1836-01-01 None \n", + "4 Q55455287-12 Q55455287-6 1836-01-01 1836-01-01 None \n", "\n", - " postcode_fake_l postcode_fake_r first_name_l first_name_r occupation_l \\\n", - "0 ta4 2ug ta4 2uu jaido jaido None \n", - "1 ta4 2ug ta4 2uu jaido jaido None \n", - "2 ta4 2ug ta4 2sz jaido jaido None \n", - "3 ta4 2ug ta4 2ug jaido jaido None \n", - "4 ta4 2ug None jaido jaido None \n", + " occupation_r first_name_l first_name_r surname_l surname_r postcode_fake_l \\\n", + "0 writer jaido jaido morata morata ta4 2ug \n", + "1 writer jaido jaido morata morata ta4 2ug \n", + "2 writer jaido jaido morata morata ta4 2ug \n", + "3 None jaido jaido morata morata ta4 2ug \n", + "4 writer jaido jaido morata morata ta4 2ug \n", "\n", - " occupation_r match_key match_probability \n", - "0 writer 0 1.0 \n", - "1 writer 0 1.0 \n", - "2 writer 0 1.0 \n", - "3 None 0 1.0 \n", - "4 writer 0 1.0 " + " postcode_fake_r match_key \n", + "0 ta4 2uu 0 \n", + "1 ta4 2uu 0 \n", + "2 ta4 2sz 0 \n", + "3 ta4 2ug 0 \n", + "4 None 0 " ] }, "execution_count": 5, @@ -534,10 +545,10 @@ "execution_count": 6, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:12.504846Z", - "iopub.status.busy": "2024-03-27T15:12:12.504476Z", - "iopub.status.idle": "2024-03-27T15:12:13.239528Z", - "shell.execute_reply": "2024-03-27T15:12:13.237906Z" + "iopub.execute_input": "2024-05-15T15:43:54.928175Z", + "iopub.status.busy": "2024-05-15T15:43:54.927807Z", + "iopub.status.idle": "2024-05-15T15:43:55.547697Z", + "shell.execute_reply": "2024-05-15T15:43:55.543024Z" } }, "outputs": [ @@ -574,10 +585,10 @@ "execution_count": 7, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:13.244396Z", - "iopub.status.busy": "2024-03-27T15:12:13.243774Z", - "iopub.status.idle": "2024-03-27T15:12:13.271453Z", - "shell.execute_reply": "2024-03-27T15:12:13.269458Z" + "iopub.execute_input": "2024-05-15T15:43:55.555934Z", + "iopub.status.busy": "2024-05-15T15:43:55.554006Z", + "iopub.status.idle": "2024-05-15T15:43:55.592918Z", + "shell.execute_reply": "2024-05-15T15:43:55.589688Z" } }, "outputs": [ @@ -621,108 +632,108 @@ " \n", " 0\n", " Q505697-1\n", - " Q505697-15\n", + " Q505697-2\n", " Q505697\n", " tom moon\n", " tom moon\n", " tom\n", " moon\n", " 1571-06-17\n", - " westminster\n", - " None\n", - " None\n", - " None\n", - " 0.792742\n", + 
" london\n", + " wc2n 4es\n", + " male\n", + " writer\n", + " 0.758496\n", " \n", " \n", " 1\n", - " Q8017455-1\n", - " Q8017455-12\n", - " Q8017455\n", - " william reeve\n", - " william reeve\n", - " william\n", - " reeve\n", - " 1815-03-16\n", - " thurrock\n", - " rm16 6de\n", + " Q84884813-1\n", + " Q84884813-12\n", + " Q84884813\n", + " fred joyce\n", + " fred joyce\n", + " fred\n", + " joyce\n", + " 1852-01-01\n", + " wigan\n", + " None\n", " male\n", " None\n", - " 0.791402\n", + " 0.268714\n", " \n", " \n", " 2\n", - " Q84562127-1\n", - " Q84562127-12\n", - " Q84562127\n", - " elsie browne\n", - " elsie browne\n", - " elsie\n", - " browne\n", - " 1853-01-01\n", - " central swindon north\n", - " sn2 8dh\n", - " male\n", + " Q20966407-1\n", + " Q21461040-14\n", + " Q21461040\n", + " tom whittle\n", + " tom whittle\n", + " tom\n", + " whittle\n", + " 1842-01-01\n", + " charlesworth\n", + " sk13 6jj\n", + " female\n", " None\n", - " 0.227872\n", + " 0.959599\n", " \n", " \n", " 3\n", - " Q1474478-1\n", - " Q654096-3\n", - " Q654096\n", - " thomas ebrill\n", - " thomas ebrill\n", - " thomas\n", - " ebrill\n", - " 1750-01-01\n", - " bolton\n", - " bl7 9rj\n", - " None\n", + " Q17388167-16\n", + " Q19601605-6\n", + " Q19601605\n", + " alfred wheeler\n", + " alfred wheeler\n", + " alfred\n", + " wheeler\n", + " 1851-01-01\n", + " bath\n", + " ba2 4sy\n", " None\n", - " 0.618269\n", + " painter\n", + " 0.001826\n", " \n", " \n", " 4\n", - " Q55595689-1\n", - " Q55595689-13\n", - " Q55595689\n", - " felice leigh\n", - " felice leigh\n", - " felice\n", - " leigh\n", - " 1853-01-01\n", - " None\n", - " chy6 2jl\n", + " Q84884813-1\n", + " Q84884813-7\n", + " Q84884813\n", + " fred wayland joyce\n", + " fred joyce\n", + " fred\n", + " joyce\n", + " 1852-01-01\n", + " wigan\n", + " wn3 4nn\n", + " male\n", " None\n", - " writer\n", - " 0.943111\n", + " 0.412256\n", " \n", " \n", "\n", "" ], "text/plain": [ - " cluster_id unique_id cluster full_name first_and_surname \\\n", - "0 Q505697-1 Q505697-15 Q505697 tom moon tom moon \n", - "1 Q8017455-1 Q8017455-12 Q8017455 william reeve william reeve \n", - "2 Q84562127-1 Q84562127-12 Q84562127 elsie browne elsie browne \n", - "3 Q1474478-1 Q654096-3 Q654096 thomas ebrill thomas ebrill \n", - "4 Q55595689-1 Q55595689-13 Q55595689 felice leigh felice leigh \n", + " cluster_id unique_id cluster full_name \\\n", + "0 Q505697-1 Q505697-2 Q505697 tom moon \n", + "1 Q84884813-1 Q84884813-12 Q84884813 fred joyce \n", + "2 Q20966407-1 Q21461040-14 Q21461040 tom whittle \n", + "3 Q17388167-16 Q19601605-6 Q19601605 alfred wheeler \n", + "4 Q84884813-1 Q84884813-7 Q84884813 fred wayland joyce \n", "\n", - " first_name surname dob birth_place postcode_fake gender \\\n", - "0 tom moon 1571-06-17 westminster None None \n", - "1 william reeve 1815-03-16 thurrock rm16 6de male \n", - "2 elsie browne 1853-01-01 central swindon north sn2 8dh male \n", - "3 thomas ebrill 1750-01-01 bolton bl7 9rj None \n", - "4 felice leigh 1853-01-01 None chy6 2jl None \n", + " first_and_surname first_name surname dob birth_place \\\n", + "0 tom moon tom moon 1571-06-17 london \n", + "1 fred joyce fred joyce 1852-01-01 wigan \n", + "2 tom whittle tom whittle 1842-01-01 charlesworth \n", + "3 alfred wheeler alfred wheeler 1851-01-01 bath \n", + "4 fred joyce fred joyce 1852-01-01 wigan \n", "\n", - " occupation __splink_salt \n", - "0 None 0.792742 \n", - "1 None 0.791402 \n", - "2 None 0.227872 \n", - "3 None 0.618269 \n", - "4 writer 0.943111 " + " postcode_fake gender occupation 
__splink_salt \n", + "0 wc2n 4es male writer 0.758496 \n", + "1 None male None 0.268714 \n", + "2 sk13 6jj female None 0.959599 \n", + "3 ba2 4sy None painter 0.001826 \n", + "4 wn3 4nn male None 0.412256 " ] }, "execution_count": 7, @@ -747,10 +758,10 @@ "execution_count": 8, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:13.278623Z", - "iopub.status.busy": "2024-03-27T15:12:13.277265Z", - "iopub.status.idle": "2024-03-27T15:12:13.404047Z", - "shell.execute_reply": "2024-03-27T15:12:13.403097Z" + "iopub.execute_input": "2024-05-15T15:43:55.600959Z", + "iopub.status.busy": "2024-05-15T15:43:55.600358Z", + "iopub.status.idle": "2024-05-15T15:43:55.761150Z", + "shell.execute_reply": "2024-05-15T15:43:55.759988Z" } }, "outputs": [ @@ -769,7 +780,7 @@ " " ], "text/plain": [ - "" + "" ] }, "execution_count": 8, From ea1a264a5a16304501505638aae704a3861c15fb Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 17:08:03 +0100 Subject: [PATCH 45/59] convert more notebooks --- .../duckdb/deduplicate_50k_synthetic.ipynb | 514 ++++++------- docs/demos/examples/duckdb/febrl3.ipynb | 351 ++++----- docs/demos/examples/duckdb/febrl4.ipynb | 726 +++++++++--------- 3 files changed, 747 insertions(+), 844 deletions(-) diff --git a/docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb b/docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb index bc07f25037..3ba8fa66e3 100644 --- a/docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb +++ b/docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb @@ -24,10 +24,10 @@ "execution_count": 1, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:11:08.473746Z", - "iopub.status.busy": "2024-03-27T15:11:08.473361Z", - "iopub.status.idle": "2024-03-27T15:11:08.489194Z", - "shell.execute_reply": "2024-03-27T15:11:08.488480Z" + "iopub.execute_input": "2024-05-15T16:07:03.040913Z", + "iopub.status.busy": "2024-05-15T16:07:03.040529Z", + "iopub.status.idle": "2024-05-15T16:07:03.045834Z", + "shell.execute_reply": "2024-05-15T16:07:03.045063Z" } }, "outputs": [], @@ -41,10 +41,10 @@ "execution_count": 2, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:11:08.493408Z", - "iopub.status.busy": "2024-03-27T15:11:08.493115Z", - "iopub.status.idle": "2024-03-27T15:11:11.063854Z", - "shell.execute_reply": "2024-03-27T15:11:11.063256Z" + "iopub.execute_input": "2024-05-15T16:07:03.049635Z", + "iopub.status.busy": "2024-05-15T16:07:03.049337Z", + "iopub.status.idle": "2024-05-15T16:07:04.275040Z", + "shell.execute_reply": "2024-05-15T16:07:04.274317Z" } }, "outputs": [ @@ -188,7 +188,6 @@ "source": [ "from splink import splink_datasets\n", "\n", - "\n", "df = splink_datasets.historical_50k\n", "df.head()" ] @@ -198,10 +197,10 @@ "execution_count": 3, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:11:11.067006Z", - "iopub.status.busy": "2024-03-27T15:11:11.066716Z", - "iopub.status.idle": "2024-03-27T15:11:12.501974Z", - "shell.execute_reply": "2024-03-27T15:11:12.500861Z" + "iopub.execute_input": "2024-05-15T16:07:04.316719Z", + "iopub.status.busy": "2024-05-15T16:07:04.315783Z", + "iopub.status.idle": "2024-05-15T16:07:05.112833Z", + "shell.execute_reply": "2024-05-15T16:07:05.112087Z" } }, "outputs": [ @@ -210,23 +209,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -287,7 +286,6 @@ "source": [ "from splink import DuckDBAPI\n", "from splink.exploratory import profile_columns\n", - "from splink.column_expression import ColumnExpression\n", "\n", "db_api = DuckDBAPI()\n", "profile_columns(df, db_api, column_expressions=[\"first_name\", \"substr(surname,1,2)\"])" @@ -298,10 +296,10 @@ "execution_count": 4, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:11:12.513410Z", - "iopub.status.busy": "2024-03-27T15:11:12.512754Z", - "iopub.status.idle": "2024-03-27T15:11:12.935751Z", - "shell.execute_reply": "2024-03-27T15:11:12.935068Z" + "iopub.execute_input": "2024-05-15T16:07:05.117580Z", + "iopub.status.busy": "2024-05-15T16:07:05.117224Z", + "iopub.status.idle": "2024-05-15T16:07:05.620193Z", + "shell.execute_reply": "2024-05-15T16:07:05.619557Z" } }, "outputs": [ @@ -310,23 +308,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -385,21 +383,25 @@ } ], "source": [ - "from splink import block_on, SettingsCreator, Linker\n", - "# Simple settings will be used for exploratory analysis\n", + "from splink import DuckDBAPI, block_on\n", + "from splink.blocking_analysis import (\n", + " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n", + ")\n", "\n", - "settings = SettingsCreator(\n", - " link_type=\"dedupe_only\",\n", - " blocking_rules_to_generate_predictions=[\n", - " block_on(\"first_name\", \"surname\"),\n", + "blocking_rules = [block_on(\"first_name\", \"surname\"),\n", " block_on(\"surname\", \"dob\"),\n", " block_on(\"first_name\", \"dob\"),\n", - " block_on(\"postcode_fake\", \"first_name\"),\n", - " ],\n", - ")\n", + " block_on(\"postcode_fake\", \"first_name\")]\n", + "\n", + "db_api = DuckDBAPI()\n", "\n", - "linker = Linker(df, settings, database_api=db_api)\n", - "linker.cumulative_num_comparisons_from_blocking_rules_chart()" + "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", + " table_or_tables=df,\n", + " blocking_rule_creators=blocking_rules,\n", + " db_api=db_api,\n", + " link_type=\"dedupe_only\",\n", + " unique_id_column_name=\"unique_id\",\n", + ")" ] }, { @@ -407,26 +409,22 @@ "execution_count": 5, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:11:12.939736Z", - "iopub.status.busy": "2024-03-27T15:11:12.939440Z", - "iopub.status.idle": "2024-03-27T15:11:13.073908Z", - "shell.execute_reply": "2024-03-27T15:11:13.072983Z" + "iopub.execute_input": "2024-05-15T16:07:05.623477Z", + "iopub.status.busy": "2024-05-15T16:07:05.623213Z", + "iopub.status.idle": "2024-05-15T16:07:05.768956Z", + "shell.execute_reply": "2024-05-15T16:07:05.768275Z" } }, "outputs": [], "source": [ - "from splink.settings_creator import SettingsCreator\n", "import splink.comparison_library as cl\n", "import splink.comparison_template_library as ctl\n", + "from splink import Linker\n", + "from splink.settings_creator import SettingsCreator\n", "\n", "settings = SettingsCreator(\n", " link_type=\"dedupe_only\",\n", - " blocking_rules_to_generate_predictions=[\n", - " block_on(\"first_name\", \"surname\"),\n", - " block_on(\"surname\", \"dob\"),\n", - " block_on(\"first_name\", \"dob\"),\n", - " block_on(\"postcode_fake\", \"first_name\"),\n", - " ],\n", + " blocking_rules_to_generate_predictions=blocking_rules,\n", " comparisons=[\n", " ctl.NameComparison(\"first_name\").configure(term_frequency_adjustments=True),\n", " ctl.NameComparison(\"surname\").configure(term_frequency_adjustments=True),\n", @@ -452,10 +450,10 @@ "execution_count": 6, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:11:13.077908Z", - "iopub.status.busy": "2024-03-27T15:11:13.077574Z", - "iopub.status.idle": "2024-03-27T15:11:13.214778Z", - "shell.execute_reply": "2024-03-27T15:11:13.214191Z" + "iopub.execute_input": "2024-05-15T16:07:05.772775Z", + "iopub.status.busy": "2024-05-15T16:07:05.772497Z", + "iopub.status.idle": "2024-05-15T16:07:06.084481Z", + "shell.execute_reply": "2024-05-15T16:07:06.083929Z" } }, "outputs": [ @@ -484,10 +482,10 @@ "execution_count": 7, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:11:13.217771Z", - "iopub.status.busy": "2024-03-27T15:11:13.217532Z", - "iopub.status.idle": "2024-03-27T15:11:21.898585Z", - "shell.execute_reply": "2024-03-27T15:11:21.897741Z" + "iopub.execute_input": "2024-05-15T16:07:06.087609Z", + "iopub.status.busy": "2024-05-15T16:07:06.087384Z", + "iopub.status.idle": 
"2024-05-15T16:07:13.105199Z", + "shell.execute_reply": "2024-05-15T16:07:13.104695Z" } }, "outputs": [ @@ -498,20 +496,6 @@ "----- Estimating u probabilities using random sampling -----\n" ] }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4816c47151d145b994566568e51c630c", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stderr", "output_type": "stream", @@ -544,10 +528,10 @@ "execution_count": 8, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:11:21.902945Z", - "iopub.status.busy": "2024-03-27T15:11:21.902612Z", - "iopub.status.idle": "2024-03-27T15:11:25.115425Z", - "shell.execute_reply": "2024-03-27T15:11:25.114614Z" + "iopub.execute_input": "2024-05-15T16:07:13.108034Z", + "iopub.status.busy": "2024-05-15T16:07:13.107820Z", + "iopub.status.idle": "2024-05-15T16:07:16.289385Z", + "shell.execute_reply": "2024-05-15T16:07:16.288708Z" } }, "outputs": [ @@ -589,70 +573,77 @@ "name": "stderr", "output_type": "stream", "text": [ - "Iteration 1: Largest change in params was -0.529 in probability_two_random_records_match\n" + "Iteration 1: Largest change in params was -0.533 in probability_two_random_records_match\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 2: Largest change in params was -0.0315 in probability_two_random_records_match\n" + "Iteration 2: Largest change in params was -0.034 in probability_two_random_records_match\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 3: Largest change in params was -0.0121 in the m_probability of birth_place, level `All other comparisons`\n" + "Iteration 3: Largest change in params was 0.0136 in the m_probability of birth_place, level `Exact match on birth_place`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 4: Largest change in params was 0.00509 in the m_probability of birth_place, level `Exact match on birth_place`\n" + "Iteration 4: Largest change in params was -0.00579 in the m_probability of birth_place, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 5: Largest change in params was 0.00232 in the m_probability of birth_place, level `Exact match on birth_place`\n" + "Iteration 5: Largest change in params was 0.00268 in the m_probability of birth_place, level `Exact match on birth_place`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 6: Largest change in params was -0.00111 in the m_probability of birth_place, level `All other comparisons`\n" + "Iteration 6: Largest change in params was 0.00129 in the m_probability of birth_place, level `Exact match on birth_place`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 7: Largest change in params was -0.000577 in the m_probability of dob, level `Abs difference of 'transformed dob <= 10 year'`\n" + "Iteration 7: Largest change in params was -0.000682 in the m_probability of dob, level `Abs difference of 'transformed dob <= 10 year'`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 8: Largest change in params was -0.000313 in the m_probability of dob, level `Abs difference of 'transformed dob <= 10 year'`\n" + "Iteration 8: Largest change in params was -0.000373 in the m_probability of dob, level `Abs difference of 'transformed dob <= 10 
year'`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 9: Largest change in params was -0.000169 in the m_probability of dob, level `Abs difference of 'transformed dob <= 10 year'`\n" + "Iteration 9: Largest change in params was -0.000203 in the m_probability of dob, level `Abs difference of 'transformed dob <= 10 year'`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 10: Largest change in params was -9.14e-05 in the m_probability of dob, level `Abs difference of 'transformed dob <= 10 year'`\n" + "Iteration 10: Largest change in params was -0.000111 in the m_probability of dob, level `Abs difference of 'transformed dob <= 10 year'`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 11: Largest change in params was -6.05e-05 in the m_probability of dob, level `Abs difference of 'transformed dob <= 10 year'`\n" ] }, { @@ -660,7 +651,7 @@ "output_type": "stream", "text": [ "\n", - "EM converged after 10 iterations\n" + "EM converged after 11 iterations\n" ] }, { @@ -686,10 +677,10 @@ "execution_count": 9, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:11:25.119154Z", - "iopub.status.busy": "2024-03-27T15:11:25.118860Z", - "iopub.status.idle": "2024-03-27T15:11:37.074776Z", - "shell.execute_reply": "2024-03-27T15:11:37.073970Z" + "iopub.execute_input": "2024-05-15T16:07:16.292730Z", + "iopub.status.busy": "2024-05-15T16:07:16.292472Z", + "iopub.status.idle": "2024-05-15T16:07:26.076237Z", + "shell.execute_reply": "2024-05-15T16:07:26.075402Z" } }, "outputs": [ @@ -720,20 +711,6 @@ " - dob\n" ] }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "8d093bba3d464dafaebcaeb55dcbef47", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stderr", "output_type": "stream", @@ -752,35 +729,35 @@ "name": "stderr", "output_type": "stream", "text": [ - "Iteration 2: Largest change in params was 0.0345 in the m_probability of first_name, level `All other comparisons`\n" + "Iteration 2: Largest change in params was 0.0343 in the m_probability of first_name, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 3: Largest change in params was 0.00494 in the m_probability of first_name, level `All other comparisons`\n" + "Iteration 3: Largest change in params was 0.00489 in the m_probability of first_name, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 4: Largest change in params was 0.0011 in the m_probability of surname, level `All other comparisons`\n" + "Iteration 4: Largest change in params was 0.00109 in the m_probability of surname, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 5: Largest change in params was 0.000264 in the m_probability of surname, level `All other comparisons`\n" + "Iteration 5: Largest change in params was 0.000261 in the m_probability of surname, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 6: Largest change in params was 6.19e-05 in the m_probability of surname, level `All other comparisons`\n" + "Iteration 6: Largest change in params was 6.1e-05 in the m_probability of surname, level `All other comparisons`\n" ] }, { @@ 
-820,10 +797,10 @@ "execution_count": 10, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:11:37.078815Z", - "iopub.status.busy": "2024-03-27T15:11:37.078506Z", - "iopub.status.idle": "2024-03-27T15:11:37.402709Z", - "shell.execute_reply": "2024-03-27T15:11:37.402055Z" + "iopub.execute_input": "2024-05-15T16:07:26.079934Z", + "iopub.status.busy": "2024-05-15T16:07:26.079660Z", + "iopub.status.idle": "2024-05-15T16:07:26.364087Z", + "shell.execute_reply": "2024-05-15T16:07:26.363559Z" } }, "outputs": [ @@ -832,23 +809,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -915,10 +892,10 @@ "execution_count": 11, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:11:37.405982Z", - "iopub.status.busy": "2024-03-27T15:11:37.405730Z", - "iopub.status.idle": "2024-03-27T15:11:39.611087Z", - "shell.execute_reply": "2024-03-27T15:11:39.610345Z" + "iopub.execute_input": "2024-05-15T16:07:26.367083Z", + "iopub.status.busy": "2024-05-15T16:07:26.366860Z", + "iopub.status.idle": "2024-05-15T16:07:28.387226Z", + "shell.execute_reply": "2024-05-15T16:07:28.386186Z" } }, "outputs": [ @@ -927,23 +904,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -1010,27 +987,13 @@ "execution_count": 12, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:11:39.616427Z", - "iopub.status.busy": "2024-03-27T15:11:39.616124Z", - "iopub.status.idle": "2024-03-27T15:11:42.409572Z", - "shell.execute_reply": "2024-03-27T15:11:42.408908Z" + "iopub.execute_input": "2024-05-15T16:07:28.393039Z", + "iopub.status.busy": "2024-05-15T16:07:28.392726Z", + "iopub.status.idle": "2024-05-15T16:07:30.731337Z", + "shell.execute_reply": "2024-05-15T16:07:30.730612Z" } }, "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9fe5f6a7b06a455fa9fb04d4088d3a78", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/html": [ @@ -1078,121 +1041,121 @@ " \n", " \n", " 0\n", - " -15.823487\n", + " -15.840427\n", " 0.000017\n", - " Q7528564-9\n", - " Q75867928-1\n", + " Q5971253-3\n", + " Q75867928-4\n", " sir\n", " sir\n", " 3\n", " 0.024985\n", " 0.024985\n", - " 39.000166\n", + " 44.906565\n", " ...\n", - " 0.157078\n", + " 0.156756\n", " 1.0\n", - " historian\n", + " naval officer\n", " military officer\n", " 0\n", - " 0.012456\n", + " 0.009451\n", " 0.010756\n", - " 0.105426\n", + " 0.104989\n", " 1.0\n", " 0\n", " \n", " \n", " 1\n", - " -15.823487\n", + " -15.840427\n", " 0.000017\n", - " Q7528564-9\n", - " Q75867928-2\n", + " Q5971253-3\n", + " Q75867928-7\n", " sir\n", " sir\n", " 3\n", " 0.024985\n", " 0.024985\n", - " 39.000166\n", + " 44.906565\n", " ...\n", - " 0.157078\n", + " 0.156756\n", " 1.0\n", - " historian\n", + " naval officer\n", " military officer\n", " 0\n", - " 0.012456\n", + " 0.009451\n", " 0.010756\n", - " 0.105426\n", + " 0.104989\n", " 1.0\n", " 0\n", " \n", " \n", " 2\n", - " -15.823487\n", + " -15.840427\n", " 0.000017\n", - " Q7528564-9\n", - " Q75867928-3\n", + " Q5971253-2\n", + " Q75867928-4\n", " sir\n", " sir\n", " 3\n", " 0.024985\n", " 0.024985\n", - " 39.000166\n", + " 44.906565\n", " ...\n", - " 0.157078\n", + " 0.156756\n", " 1.0\n", - " historian\n", + " naval officer\n", " military officer\n", " 0\n", - " 0.012456\n", + " 0.009451\n", " 0.010756\n", - " 0.105426\n", + " 0.104989\n", " 1.0\n", " 0\n", " \n", " \n", " 3\n", - " -15.823487\n", + " -15.840427\n", " 0.000017\n", - " Q7528564-9\n", - " Q75867928-4\n", + " Q5971253-2\n", + " Q75867928-7\n", " sir\n", " sir\n", " 3\n", " 0.024985\n", " 0.024985\n", - " 39.000166\n", + " 44.906565\n", " ...\n", - " 0.157078\n", + " 0.156756\n", " 1.0\n", - " historian\n", + " naval officer\n", " military officer\n", " 0\n", - " 0.012456\n", + " 0.009451\n", " 0.010756\n", - " 0.105426\n", + " 0.104989\n", " 1.0\n", " 0\n", " \n", " \n", " 4\n", - " -15.823487\n", + " -15.840427\n", " 0.000017\n", - " Q7528564-9\n", - " Q75867928-6\n", + " Q5971253-1\n", + " Q75867928-4\n", " sir\n", " sir\n", " 3\n", " 0.024985\n", " 0.024985\n", - " 39.000166\n", + " 44.906565\n", " ...\n", - " 0.157078\n", + " 0.156756\n", " 1.0\n", - " historian\n", + " naval officer\n", " military officer\n", " 0\n", - " 0.012456\n", + " 0.009451\n", " 0.010756\n", - " 0.105426\n", + " 0.104989\n", " 1.0\n", " 0\n", " \n", @@ -1203,11 +1166,11 @@ ], "text/plain": [ " match_weight match_probability unique_id_l unique_id_r first_name_l \\\n", - "0 -15.823487 0.000017 Q7528564-9 Q75867928-1 sir \n", - "1 -15.823487 0.000017 Q7528564-9 Q75867928-2 sir 
\n", - "2 -15.823487 0.000017 Q7528564-9 Q75867928-3 sir \n", - "3 -15.823487 0.000017 Q7528564-9 Q75867928-4 sir \n", - "4 -15.823487 0.000017 Q7528564-9 Q75867928-6 sir \n", + "0 -15.840427 0.000017 Q5971253-3 Q75867928-4 sir \n", + "1 -15.840427 0.000017 Q5971253-3 Q75867928-7 sir \n", + "2 -15.840427 0.000017 Q5971253-2 Q75867928-4 sir \n", + "3 -15.840427 0.000017 Q5971253-2 Q75867928-7 sir \n", + "4 -15.840427 0.000017 Q5971253-1 Q75867928-4 sir \n", "\n", " first_name_r gamma_first_name tf_first_name_l tf_first_name_r \\\n", "0 sir 3 0.024985 0.024985 \n", @@ -1216,26 +1179,26 @@ "3 sir 3 0.024985 0.024985 \n", "4 sir 3 0.024985 0.024985 \n", "\n", - " bf_first_name ... bf_birth_place bf_tf_adj_birth_place occupation_l \\\n", - "0 39.000166 ... 0.157078 1.0 historian \n", - "1 39.000166 ... 0.157078 1.0 historian \n", - "2 39.000166 ... 0.157078 1.0 historian \n", - "3 39.000166 ... 0.157078 1.0 historian \n", - "4 39.000166 ... 0.157078 1.0 historian \n", + " bf_first_name ... bf_birth_place bf_tf_adj_birth_place occupation_l \\\n", + "0 44.906565 ... 0.156756 1.0 naval officer \n", + "1 44.906565 ... 0.156756 1.0 naval officer \n", + "2 44.906565 ... 0.156756 1.0 naval officer \n", + "3 44.906565 ... 0.156756 1.0 naval officer \n", + "4 44.906565 ... 0.156756 1.0 naval officer \n", "\n", " occupation_r gamma_occupation tf_occupation_l tf_occupation_r \\\n", - "0 military officer 0 0.012456 0.010756 \n", - "1 military officer 0 0.012456 0.010756 \n", - "2 military officer 0 0.012456 0.010756 \n", - "3 military officer 0 0.012456 0.010756 \n", - "4 military officer 0 0.012456 0.010756 \n", + "0 military officer 0 0.009451 0.010756 \n", + "1 military officer 0 0.009451 0.010756 \n", + "2 military officer 0 0.009451 0.010756 \n", + "3 military officer 0 0.009451 0.010756 \n", + "4 military officer 0 0.009451 0.010756 \n", "\n", " bf_occupation bf_tf_adj_occupation match_key \n", - "0 0.105426 1.0 0 \n", - "1 0.105426 1.0 0 \n", - "2 0.105426 1.0 0 \n", - "3 0.105426 1.0 0 \n", - "4 0.105426 1.0 0 \n", + "0 0.104989 1.0 0 \n", + "1 0.104989 1.0 0 \n", + "2 0.104989 1.0 0 \n", + "3 0.104989 1.0 0 \n", + "4 0.104989 1.0 0 \n", "\n", "[5 rows x 41 columns]" ] @@ -1264,10 +1227,10 @@ "execution_count": 13, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:11:42.413681Z", - "iopub.status.busy": "2024-03-27T15:11:42.413370Z", - "iopub.status.idle": "2024-03-27T15:11:43.198735Z", - "shell.execute_reply": "2024-03-27T15:11:43.197945Z" + "iopub.execute_input": "2024-05-15T16:07:30.735380Z", + "iopub.status.busy": "2024-05-15T16:07:30.735079Z", + "iopub.status.idle": "2024-05-15T16:07:31.361460Z", + "shell.execute_reply": "2024-05-15T16:07:31.360879Z" } }, "outputs": [ @@ -1276,23 +1239,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -1351,7 +1314,6 @@ } ], "source": [ - "from splink.charts import waterfall_chart\n", "\n", "records_to_plot = df_e.to_dict(orient=\"records\")\n", "linker.waterfall_chart(records_to_plot, filter_nulls=False)" @@ -1362,10 +1324,10 @@ "execution_count": 14, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:11:43.202898Z", - "iopub.status.busy": "2024-03-27T15:11:43.202514Z", - "iopub.status.idle": "2024-03-27T15:11:43.649732Z", - "shell.execute_reply": "2024-03-27T15:11:43.649037Z" + "iopub.execute_input": "2024-05-15T16:07:31.364481Z", + "iopub.status.busy": "2024-05-15T16:07:31.364255Z", + "iopub.status.idle": "2024-05-15T16:07:31.746356Z", + "shell.execute_reply": "2024-05-15T16:07:31.745671Z" } }, "outputs": [ @@ -1416,10 +1378,10 @@ "execution_count": 15, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:11:43.654088Z", - "iopub.status.busy": "2024-03-27T15:11:43.653706Z", - "iopub.status.idle": "2024-03-27T15:11:43.859304Z", - "shell.execute_reply": "2024-03-27T15:11:43.858517Z" + "iopub.execute_input": "2024-05-15T16:07:31.749625Z", + "iopub.status.busy": "2024-05-15T16:07:31.749370Z", + "iopub.status.idle": "2024-05-15T16:07:31.898014Z", + "shell.execute_reply": "2024-05-15T16:07:31.897301Z" } }, "outputs": [ @@ -1438,7 +1400,7 @@ " " ], "text/plain": [ - "" + "" ] }, "execution_count": 15, @@ -1465,49 +1427,35 @@ "execution_count": 16, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:11:43.863521Z", - "iopub.status.busy": "2024-03-27T15:11:43.863184Z", - "iopub.status.idle": "2024-03-27T15:11:58.747184Z", - "shell.execute_reply": "2024-03-27T15:11:58.746355Z" + "iopub.execute_input": "2024-05-15T16:07:31.901400Z", + "iopub.status.busy": "2024-05-15T16:07:31.901154Z", + "iopub.status.idle": "2024-05-15T16:07:44.228710Z", + "shell.execute_reply": "2024-05-15T16:07:44.227315Z" } }, "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "06545e908438426c8185e5bc9b35b182", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -1574,10 +1522,10 @@ "execution_count": 17, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:11:58.790820Z", - "iopub.status.busy": "2024-03-27T15:11:58.790513Z", - "iopub.status.idle": "2024-03-27T15:12:02.466213Z", - "shell.execute_reply": "2024-03-27T15:12:02.465677Z" + "iopub.execute_input": "2024-05-15T16:07:44.268428Z", + "iopub.status.busy": "2024-05-15T16:07:44.268099Z", + "iopub.status.idle": "2024-05-15T16:07:47.826572Z", + "shell.execute_reply": "2024-05-15T16:07:47.826055Z" } }, "outputs": [ @@ -1586,23 +1534,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -1675,10 +1623,10 @@ "execution_count": 18, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:02.474301Z", - "iopub.status.busy": "2024-03-27T15:12:02.474027Z", - "iopub.status.idle": "2024-03-27T15:12:06.026315Z", - "shell.execute_reply": "2024-03-27T15:12:06.025616Z" + "iopub.execute_input": "2024-05-15T16:07:47.834324Z", + "iopub.status.busy": "2024-05-15T16:07:47.834092Z", + "iopub.status.idle": "2024-05-15T16:07:51.080047Z", + "shell.execute_reply": "2024-05-15T16:07:51.079464Z" } }, "outputs": [ @@ -1687,23 +1635,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ diff --git a/docs/demos/examples/duckdb/febrl3.ipynb b/docs/demos/examples/duckdb/febrl3.ipynb index b2201c1a48..ae12e01cbd 100644 --- a/docs/demos/examples/duckdb/febrl3.ipynb +++ b/docs/demos/examples/duckdb/febrl3.ipynb @@ -23,10 +23,10 @@ "execution_count": 1, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:15.921447Z", - "iopub.status.busy": "2024-03-27T15:12:15.921152Z", - "iopub.status.idle": "2024-03-27T15:12:15.940947Z", - "shell.execute_reply": "2024-03-27T15:12:15.939997Z" + "iopub.execute_input": "2024-05-15T15:50:53.970752Z", + "iopub.status.busy": "2024-05-15T15:50:53.970419Z", + "iopub.status.idle": "2024-05-15T15:50:53.975673Z", + "shell.execute_reply": "2024-05-15T15:50:53.974958Z" } }, "outputs": [], @@ -40,10 +40,10 @@ "execution_count": 2, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:15.945548Z", - "iopub.status.busy": "2024-03-27T15:12:15.945224Z", - "iopub.status.idle": "2024-03-27T15:12:17.598730Z", - "shell.execute_reply": "2024-03-27T15:12:17.597871Z" + "iopub.execute_input": "2024-05-15T15:50:53.979321Z", + "iopub.status.busy": "2024-05-15T15:50:53.979040Z", + "iopub.status.idle": "2024-05-15T15:50:55.403280Z", + "shell.execute_reply": "2024-05-15T15:50:55.402512Z" } }, "outputs": [ @@ -133,7 +133,6 @@ } ], "source": [ - "import pandas as pd\n", "from splink.datasets import splink_datasets\n", "\n", "df = splink_datasets.febrl3\n", @@ -152,10 +151,10 @@ "execution_count": 3, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:17.603215Z", - "iopub.status.busy": "2024-03-27T15:12:17.602620Z", - "iopub.status.idle": "2024-03-27T15:12:17.611176Z", - "shell.execute_reply": "2024-03-27T15:12:17.610332Z" + "iopub.execute_input": "2024-05-15T15:50:55.445888Z", + "iopub.status.busy": "2024-05-15T15:50:55.445564Z", + "iopub.status.idle": "2024-05-15T15:50:55.453559Z", + "shell.execute_reply": "2024-05-15T15:50:55.452728Z" } }, "outputs": [], @@ -169,10 +168,10 @@ "execution_count": 4, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:17.615178Z", - "iopub.status.busy": "2024-03-27T15:12:17.614835Z", - "iopub.status.idle": "2024-03-27T15:12:17.623118Z", - "shell.execute_reply": "2024-03-27T15:12:17.622311Z" + "iopub.execute_input": "2024-05-15T15:50:55.457023Z", + "iopub.status.busy": "2024-05-15T15:50:55.456741Z", + "iopub.status.idle": "2024-05-15T15:50:55.464209Z", + "shell.execute_reply": "2024-05-15T15:50:55.463386Z" } }, "outputs": [], @@ -186,15 +185,15 @@ "execution_count": 5, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:17.626961Z", - "iopub.status.busy": "2024-03-27T15:12:17.626653Z", - "iopub.status.idle": "2024-03-27T15:12:18.023586Z", - "shell.execute_reply": "2024-03-27T15:12:18.022831Z" + "iopub.execute_input": "2024-05-15T15:50:55.467779Z", + "iopub.status.busy": "2024-05-15T15:50:55.467486Z", + "iopub.status.idle": "2024-05-15T15:50:55.617978Z", + "shell.execute_reply": "2024-05-15T15:50:55.617331Z" } }, "outputs": [], "source": [ - "from splink import Linker, DuckDBAPI, SettingsCreator\n", + "from splink import DuckDBAPI, Linker, SettingsCreator\n", "\n", "# TODO: Allow missingness to be analysed without a linker\n", "settings = SettingsCreator(\n", @@ -217,10 +216,10 @@ "execution_count": 6, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:18.028485Z", - "iopub.status.busy": "2024-03-27T15:12:18.028094Z", - "iopub.status.idle": "2024-03-27T15:12:18.360467Z", - "shell.execute_reply": 
"2024-03-27T15:12:18.359768Z" + "iopub.execute_input": "2024-05-15T15:50:55.621604Z", + "iopub.status.busy": "2024-05-15T15:50:55.621314Z", + "iopub.status.idle": "2024-05-15T15:50:55.930689Z", + "shell.execute_reply": "2024-05-15T15:50:55.929809Z" } }, "outputs": [ @@ -229,23 +228,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -305,6 +304,7 @@ ], "source": [ "from splink.exploratory import completeness_chart\n", + "\n", "completeness_chart(df, db_api=DuckDBAPI())" ] }, @@ -313,10 +313,10 @@ "execution_count": 7, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:18.365367Z", - "iopub.status.busy": "2024-03-27T15:12:18.365013Z", - "iopub.status.idle": "2024-03-27T15:12:19.020779Z", - "shell.execute_reply": "2024-03-27T15:12:19.020018Z" + "iopub.execute_input": "2024-05-15T15:50:55.933815Z", + "iopub.status.busy": "2024-05-15T15:50:55.933588Z", + "iopub.status.idle": "2024-05-15T15:50:56.393881Z", + "shell.execute_reply": "2024-05-15T15:50:56.393363Z" } }, "outputs": [ @@ -325,23 +325,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -401,6 +401,7 @@ ], "source": [ "from splink.exploratory import profile_columns\n", + "\n", "profile_columns(df, db_api=DuckDBAPI(), column_expressions=[\"given_name\", \"surname\"])" ] }, @@ -409,10 +410,10 @@ "execution_count": 8, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:19.025236Z", - "iopub.status.busy": "2024-03-27T15:12:19.024915Z", - "iopub.status.idle": "2024-03-27T15:12:19.210387Z", - "shell.execute_reply": "2024-03-27T15:12:19.209717Z" + "iopub.execute_input": "2024-05-15T15:50:56.397337Z", + "iopub.status.busy": "2024-05-15T15:50:56.396993Z", + "iopub.status.idle": "2024-05-15T15:50:56.749566Z", + "shell.execute_reply": "2024-05-15T15:50:56.748922Z" } }, "outputs": [ @@ -421,23 +422,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -496,7 +497,10 @@ } ], "source": [ - "from splink.blocking_rule_library import block_on\n", + "from splink import DuckDBAPI, block_on\n", + "from splink.blocking_analysis import (\n", + " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n", + ")\n", "\n", "blocking_rules = [\n", " block_on(\"soc_sec_id\"),\n", @@ -506,7 +510,14 @@ " block_on(\"postcode\"),\n", "]\n", "\n", - "linker.cumulative_num_comparisons_from_blocking_rules_chart(blocking_rules)" + "db_api = DuckDBAPI()\n", + "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", + " table_or_tables=df,\n", + " blocking_rule_creators=blocking_rules,\n", + " db_api=db_api,\n", + " link_type=\"dedupe_only\",\n", + " unique_id_column_name=\"rec_id\",\n", + ")" ] }, { @@ -514,18 +525,17 @@ "execution_count": 9, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:19.214929Z", - "iopub.status.busy": "2024-03-27T15:12:19.214468Z", - "iopub.status.idle": "2024-03-27T15:12:19.374722Z", - "shell.execute_reply": "2024-03-27T15:12:19.373923Z" + "iopub.execute_input": "2024-05-15T15:50:56.752854Z", + "iopub.status.busy": "2024-05-15T15:50:56.752596Z", + "iopub.status.idle": "2024-05-15T15:50:56.907514Z", + "shell.execute_reply": "2024-05-15T15:50:56.906772Z" } }, "outputs": [], "source": [ - "from splink.linker import Linker\n", "import splink.comparison_library as cl\n", "import splink.comparison_template_library as ctl\n", - "\n", + "from splink.linker import Linker\n", "\n", "settings = SettingsCreator(\n", " unique_id_column_name=\"rec_id\",\n", @@ -557,10 +567,10 @@ "execution_count": 10, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:19.378508Z", - "iopub.status.busy": "2024-03-27T15:12:19.378212Z", - "iopub.status.idle": "2024-03-27T15:12:19.627976Z", - "shell.execute_reply": "2024-03-27T15:12:19.627123Z" + "iopub.execute_input": "2024-05-15T15:50:56.910709Z", + "iopub.status.busy": "2024-05-15T15:50:56.910470Z", + "iopub.status.idle": "2024-05-15T15:50:57.119744Z", + "shell.execute_reply": "2024-05-15T15:50:57.119133Z" } }, "outputs": [ @@ -575,7 +585,6 @@ ], "source": [ "from splink.blocking_rule_library import block_on\n", - "from splink.blocking_rule_library import BlockingRuleCreator\n", "\n", "deterministic_rules = [\n", " block_on(\"soc_sec_id\"),\n", @@ -591,10 +600,10 @@ "execution_count": 11, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:19.632601Z", - "iopub.status.busy": "2024-03-27T15:12:19.632254Z", - "iopub.status.idle": "2024-03-27T15:12:25.539202Z", - "shell.execute_reply": "2024-03-27T15:12:25.538586Z" + "iopub.execute_input": "2024-05-15T15:50:57.122905Z", + "iopub.status.busy": "2024-05-15T15:50:57.122623Z", + "iopub.status.idle": "2024-05-15T15:51:01.161828Z", + "shell.execute_reply": "2024-05-15T15:51:01.161251Z" } }, "outputs": [ @@ -605,20 +614,6 @@ "----- Estimating u probabilities using random sampling -----\n" ] }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e181cb7618b74e4bbf9f2e144b68b87e", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stderr", "output_type": "stream", @@ -672,10 +667,10 @@ "execution_count": 12, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:25.543780Z", - "iopub.status.busy": "2024-03-27T15:12:25.543498Z", - 
"iopub.status.idle": "2024-03-27T15:12:26.136873Z", - "shell.execute_reply": "2024-03-27T15:12:26.136073Z" + "iopub.execute_input": "2024-05-15T15:51:01.165539Z", + "iopub.status.busy": "2024-05-15T15:51:01.165298Z", + "iopub.status.idle": "2024-05-15T15:51:01.704281Z", + "shell.execute_reply": "2024-05-15T15:51:01.703690Z" } }, "outputs": [ @@ -724,21 +719,21 @@ "name": "stderr", "output_type": "stream", "text": [ - "Iteration 2: Largest change in params was 0.0154 in the m_probability of given_name, level `All other comparisons`\n" + "Iteration 2: Largest change in params was 0.0152 in the m_probability of given_name, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 3: Largest change in params was -0.000671 in the m_probability of postcode, level `Exact match on postcode`\n" + "Iteration 3: Largest change in params was 0.000666 in the m_probability of postcode, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 4: Largest change in params was 3.55e-05 in the m_probability of postcode, level `All other comparisons`\n" + "Iteration 4: Largest change in params was 3.54e-05 in the m_probability of postcode, level `All other comparisons`\n" ] }, { @@ -771,10 +766,10 @@ "execution_count": 13, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:26.140706Z", - "iopub.status.busy": "2024-03-27T15:12:26.140400Z", - "iopub.status.idle": "2024-03-27T15:12:26.856071Z", - "shell.execute_reply": "2024-03-27T15:12:26.855354Z" + "iopub.execute_input": "2024-05-15T15:51:01.707325Z", + "iopub.status.busy": "2024-05-15T15:51:01.707114Z", + "iopub.status.idle": "2024-05-15T15:51:02.290513Z", + "shell.execute_reply": "2024-05-15T15:51:02.290020Z" } }, "outputs": [ @@ -843,21 +838,21 @@ "name": "stderr", "output_type": "stream", "text": [ - "Iteration 1: Largest change in params was 0.0609 in probability_two_random_records_match\n" + "Iteration 1: Largest change in params was 0.0681 in probability_two_random_records_match\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 2: Largest change in params was -0.00186 in the m_probability of date_of_birth, level `Exact match on date_of_birth`\n" + "Iteration 2: Largest change in params was -0.00191 in the m_probability of date_of_birth, level `Exact match on date_of_birth`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 3: Largest change in params was 5.24e-05 in the m_probability of soc_sec_id, level `All other comparisons`\n" + "Iteration 3: Largest change in params was 5.43e-05 in the m_probability of soc_sec_id, level `All other comparisons`\n" ] }, { @@ -911,10 +906,10 @@ "execution_count": 14, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:26.861160Z", - "iopub.status.busy": "2024-03-27T15:12:26.860878Z", - "iopub.status.idle": "2024-03-27T15:12:27.222487Z", - "shell.execute_reply": "2024-03-27T15:12:27.221822Z" + "iopub.execute_input": "2024-05-15T15:51:02.294783Z", + "iopub.status.busy": "2024-05-15T15:51:02.294498Z", + "iopub.status.idle": "2024-05-15T15:51:02.665651Z", + "shell.execute_reply": "2024-05-15T15:51:02.665073Z" } }, "outputs": [ @@ -923,23 +918,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -1006,27 +1001,13 @@ "execution_count": 15, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:27.226108Z", - "iopub.status.busy": "2024-03-27T15:12:27.225817Z", - "iopub.status.idle": "2024-03-27T15:12:34.784821Z", - "shell.execute_reply": "2024-03-27T15:12:34.784089Z" + "iopub.execute_input": "2024-05-15T15:51:02.668752Z", + "iopub.status.busy": "2024-05-15T15:51:02.668512Z", + "iopub.status.idle": "2024-05-15T15:51:09.240685Z", + "shell.execute_reply": "2024-05-15T15:51:09.240109Z" } }, "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "94aaeff2f888492ea321d4e4492526ff", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stderr", "output_type": "stream", @@ -1050,10 +1031,10 @@ "execution_count": 16, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:34.788677Z", - "iopub.status.busy": "2024-03-27T15:12:34.788362Z", - "iopub.status.idle": "2024-03-27T15:12:38.203552Z", - "shell.execute_reply": "2024-03-27T15:12:38.202805Z" + "iopub.execute_input": "2024-05-15T15:51:09.243955Z", + "iopub.status.busy": "2024-05-15T15:51:09.243667Z", + "iopub.status.idle": "2024-05-15T15:51:11.811265Z", + "shell.execute_reply": "2024-05-15T15:51:11.810638Z" } }, "outputs": [ @@ -1075,23 +1056,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -1158,10 +1139,10 @@ "execution_count": 17, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:38.214590Z", - "iopub.status.busy": "2024-03-27T15:12:38.214089Z", - "iopub.status.idle": "2024-03-27T15:12:39.005476Z", - "shell.execute_reply": "2024-03-27T15:12:39.004679Z" + "iopub.execute_input": "2024-05-15T15:51:11.820946Z", + "iopub.status.busy": "2024-05-15T15:51:11.820644Z", + "iopub.status.idle": "2024-05-15T15:51:12.084284Z", + "shell.execute_reply": "2024-05-15T15:51:12.083443Z" } }, "outputs": [ @@ -1227,8 +1208,8 @@ " 0\n", " 1.0\n", " False\n", - " -27.447353\n", - " 5.464160e-09\n", + " -27.447151\n", + " 5.464925e-09\n", " rec-993-dup-1\n", " rec-993-dup-3\n", " westbrook\n", @@ -1241,7 +1222,7 @@ " 0\n", " 0.0002\n", " 0.0014\n", - " 0.230071\n", + " 0.230072\n", " 1.0\n", " rec-993\n", " rec-993\n", @@ -1251,8 +1232,8 @@ " 1\n", " 1.0\n", " False\n", - " -27.447353\n", - " 5.464160e-09\n", + " -27.447151\n", + " 5.464925e-09\n", " rec-829-dup-0\n", " rec-829-dup-2\n", " wilde\n", @@ -1265,7 +1246,7 @@ " 0\n", " 0.0004\n", " 0.0006\n", - " 0.230071\n", + " 0.230072\n", " 1.0\n", " rec-829\n", " rec-829\n", @@ -1275,8 +1256,8 @@ " 2\n", " 1.0\n", " False\n", - " -19.359440\n", - " 1.486713e-06\n", + " -19.359354\n", + " 1.486802e-06\n", " rec-829-dup-0\n", " rec-829-dup-1\n", " wilde\n", @@ -1289,7 +1270,7 @@ " 0\n", " 0.0004\n", " 0.0002\n", - " 0.230071\n", + " 0.230072\n", " 1.0\n", " rec-829\n", " rec-829\n", @@ -1299,8 +1280,8 @@ " 3\n", " 1.0\n", " True\n", - " -15.232985\n", - " 2.596590e-05\n", + " -15.232752\n", + " 2.597009e-05\n", " rec-721-dup-0\n", " rec-721-dup-1\n", " mikhaili\n", @@ -1313,7 +1294,7 @@ " 0\n", " 0.0008\n", " 0.0014\n", - " 0.230071\n", + " 0.230072\n", " 1.0\n", " rec-721\n", " rec-721\n", @@ -1323,8 +1304,8 @@ " 4\n", " 1.0\n", " True\n", - " -12.553825\n", - " 1.662838e-04\n", + " -12.570818\n", + " 1.643370e-04\n", " rec-401-dup-1\n", " rec-401-dup-3\n", " whitbe\n", @@ -1337,7 +1318,7 @@ " 0\n", " 0.0020\n", " 0.0004\n", - " 0.230071\n", + " 0.230072\n", " 1.0\n", " rec-401\n", " rec-401\n", @@ -1350,18 +1331,18 @@ ], "text/plain": [ " clerical_match_score found_by_blocking_rules match_weight \\\n", - "0 1.0 False -27.447353 \n", - "1 1.0 False -27.447353 \n", - "2 1.0 False -19.359440 \n", - "3 1.0 True -15.232985 \n", - "4 1.0 True -12.553825 \n", + "0 1.0 False -27.447151 \n", + "1 1.0 False -27.447151 \n", + "2 1.0 False -19.359354 \n", + "3 1.0 True -15.232752 \n", + "4 1.0 True -12.570818 \n", "\n", " match_probability rec_id_l rec_id_r given_name_l given_name_r \\\n", - "0 5.464160e-09 rec-993-dup-1 rec-993-dup-3 westbrook jake \n", - "1 5.464160e-09 rec-829-dup-0 rec-829-dup-2 wilde kyra \n", - "2 1.486713e-06 rec-829-dup-0 rec-829-dup-1 wilde kyra \n", - "3 2.596590e-05 rec-721-dup-0 rec-721-dup-1 mikhaili elly \n", - "4 1.662838e-04 rec-401-dup-1 rec-401-dup-3 whitbe alexa-ose \n", + "0 5.464925e-09 rec-993-dup-1 rec-993-dup-3 westbrook jake \n", + "1 5.464925e-09 rec-829-dup-0 rec-829-dup-2 wilde kyra \n", + "2 1.486802e-06 rec-829-dup-0 rec-829-dup-1 wilde kyra \n", + "3 2.597009e-05 rec-721-dup-0 rec-721-dup-1 mikhaili elly \n", + "4 1.643370e-04 rec-401-dup-1 rec-401-dup-3 whitbe alexa-ose \n", "\n", " gamma_given_name tf_given_name_l ... postcode_l postcode_r \\\n", "0 0 0.0004 ... 2704 2074 \n", @@ -1371,11 +1352,11 @@ "4 0 0.0002 ... 
3040 3041 \n", "\n", " gamma_postcode tf_postcode_l tf_postcode_r bf_postcode \\\n", - "0 0 0.0002 0.0014 0.230071 \n", - "1 0 0.0004 0.0006 0.230071 \n", - "2 0 0.0004 0.0002 0.230071 \n", - "3 0 0.0008 0.0014 0.230071 \n", - "4 0 0.0020 0.0004 0.230071 \n", + "0 0 0.0002 0.0014 0.230072 \n", + "1 0 0.0004 0.0006 0.230072 \n", + "2 0 0.0004 0.0002 0.230072 \n", + "3 0 0.0008 0.0014 0.230072 \n", + "4 0 0.0020 0.0004 0.230072 \n", "\n", " bf_tf_adj_postcode cluster_l cluster_r match_key \n", "0 1.0 rec-993 rec-993 5 \n", @@ -1405,10 +1386,10 @@ "execution_count": 18, "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:39.009486Z", - "iopub.status.busy": "2024-03-27T15:12:39.009173Z", - "iopub.status.idle": "2024-03-27T15:12:40.551056Z", - "shell.execute_reply": "2024-03-27T15:12:40.550285Z" + "iopub.execute_input": "2024-05-15T15:51:12.087291Z", + "iopub.status.busy": "2024-05-15T15:51:12.087021Z", + "iopub.status.idle": "2024-05-15T15:51:13.092062Z", + "shell.execute_reply": "2024-05-15T15:51:13.091503Z" } }, "outputs": [ @@ -1430,23 +1411,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ diff --git a/docs/demos/examples/duckdb/febrl4.ipynb b/docs/demos/examples/duckdb/febrl4.ipynb index 9c55c3ec9b..7761fdc413 100644 --- a/docs/demos/examples/duckdb/febrl4.ipynb +++ b/docs/demos/examples/duckdb/febrl4.ipynb @@ -30,10 +30,10 @@ "id": "9c2be649", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:43.519722Z", - "iopub.status.busy": "2024-03-27T15:12:43.519314Z", - "iopub.status.idle": "2024-03-27T15:12:43.540219Z", - "shell.execute_reply": "2024-03-27T15:12:43.539395Z" + "iopub.execute_input": "2024-05-15T15:56:42.115992Z", + "iopub.status.busy": "2024-05-15T15:56:42.115623Z", + "iopub.status.idle": "2024-05-15T15:56:42.138818Z", + "shell.execute_reply": "2024-05-15T15:56:42.137554Z" } }, "outputs": [], @@ -64,10 +64,10 @@ "id": "832113c9-13b2-43b7-86d0-6051a9db79e8", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:43.544649Z", - "iopub.status.busy": "2024-03-27T15:12:43.544328Z", - "iopub.status.idle": "2024-03-27T15:12:45.161474Z", - "shell.execute_reply": "2024-03-27T15:12:45.160546Z" + "iopub.execute_input": "2024-05-15T15:56:42.144735Z", + "iopub.status.busy": "2024-05-15T15:56:42.144299Z", + "iopub.status.idle": "2024-05-15T15:56:44.123585Z", + "shell.execute_reply": "2024-05-15T15:56:44.122726Z" } }, "outputs": [ @@ -243,10 +243,7 @@ } ], "source": [ - "import pandas as pd\n", - "import altair as alt\n", "from splink import splink_datasets\n", - "from IPython.display import IFrame\n", "\n", "df_a = splink_datasets.febrl4a\n", "df_b = splink_datasets.febrl4b\n", @@ -281,15 +278,15 @@ "id": "3233c3e1-3e6b-4abc-8bed-c26e8b463c2a", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:45.166277Z", - "iopub.status.busy": "2024-03-27T15:12:45.165910Z", - "iopub.status.idle": "2024-03-27T15:12:45.605472Z", - "shell.execute_reply": "2024-03-27T15:12:45.604577Z" + "iopub.execute_input": "2024-05-15T15:56:44.128064Z", + "iopub.status.busy": "2024-05-15T15:56:44.127470Z", + "iopub.status.idle": "2024-05-15T15:56:44.412449Z", + "shell.execute_reply": "2024-05-15T15:56:44.410927Z" } }, "outputs": [], "source": [ - "from splink import Linker, DuckDBAPI, SettingsCreator\n", + "from splink import DuckDBAPI, Linker, SettingsCreator\n", "\n", "basic_settings = SettingsCreator(\n", " unique_id_column_name=\"rec_id\",\n", @@ -318,10 +315,10 @@ "id": "319ffdbc-7853-40a9-b331-e635d96b6fdc", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:45.609844Z", - "iopub.status.busy": "2024-03-27T15:12:45.609512Z", - "iopub.status.idle": "2024-03-27T15:12:46.350606Z", - "shell.execute_reply": "2024-03-27T15:12:46.349830Z" + "iopub.execute_input": "2024-05-15T15:56:44.418048Z", + "iopub.status.busy": "2024-05-15T15:56:44.417174Z", + "iopub.status.idle": "2024-05-15T15:56:45.018140Z", + "shell.execute_reply": "2024-05-15T15:56:45.017233Z" } }, "outputs": [ @@ -330,23 +327,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -406,6 +403,7 @@ ], "source": [ "from splink.exploratory import completeness_chart\n", + "\n", "completeness_chart(dfs, db_api=DuckDBAPI())" ] }, @@ -415,10 +413,10 @@ "id": "dff8dfca-57c8-42bf-878c-da9dd23d2682", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:46.354377Z", - "iopub.status.busy": "2024-03-27T15:12:46.354086Z", - "iopub.status.idle": "2024-03-27T15:12:47.438524Z", - "shell.execute_reply": "2024-03-27T15:12:47.436625Z" + "iopub.execute_input": "2024-05-15T15:56:45.022368Z", + "iopub.status.busy": "2024-05-15T15:56:45.021805Z", + "iopub.status.idle": "2024-05-15T15:56:45.760354Z", + "shell.execute_reply": "2024-05-15T15:56:45.759671Z" } }, "outputs": [ @@ -427,23 +425,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -503,6 +501,7 @@ ], "source": [ "from splink.exploratory import profile_columns\n", + "\n", "profile_columns(dfs, db_api=DuckDBAPI(), column_expressions=[\"given_name\", \"surname\"])" ] }, @@ -522,10 +521,10 @@ "id": "e745280e-fe2f-4563-bd7e-6e4c70d0c9de", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:47.443398Z", - "iopub.status.busy": "2024-03-27T15:12:47.443059Z", - "iopub.status.idle": "2024-03-27T15:12:48.111729Z", - "shell.execute_reply": "2024-03-27T15:12:48.111012Z" + "iopub.execute_input": "2024-05-15T15:56:45.764541Z", + "iopub.status.busy": "2024-05-15T15:56:45.764220Z", + "iopub.status.idle": "2024-05-15T15:56:46.595508Z", + "shell.execute_reply": "2024-05-15T15:56:46.594573Z" } }, "outputs": [ @@ -534,23 +533,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -609,8 +608,10 @@ } ], "source": [ - "from splink.blocking_rule_library import block_on\n", - "from splink.blocking_rule_library import BlockingRuleCreator\n", + "from splink import DuckDBAPI, block_on\n", + "from splink.blocking_analysis import (\n", + " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n", + ")\n", "\n", "blocking_rules = [\n", " block_on(\"given_name\", \"surname\"),\n", @@ -624,7 +625,15 @@ "]\n", "\n", "\n", - "linker.cumulative_num_comparisons_from_blocking_rules_chart(blocking_rules)" + "db_api = DuckDBAPI()\n", + "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", + " table_or_tables=dfs,\n", + " blocking_rule_creators=blocking_rules,\n", + " db_api=db_api,\n", + " link_type=\"link_only\",\n", + " unique_id_column_name=\"rec_id\",\n", + " source_dataset_column_name=\"source_dataset\",\n", + ")" ] }, { @@ -652,17 +661,17 @@ "id": "f6360b69-2d52-4f1a-9199-2edf2339ec63", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:48.116112Z", - "iopub.status.busy": "2024-03-27T15:12:48.115772Z", - "iopub.status.idle": "2024-03-27T15:12:48.586460Z", - "shell.execute_reply": "2024-03-27T15:12:48.585386Z" + "iopub.execute_input": "2024-05-15T15:56:46.600071Z", + "iopub.status.busy": "2024-05-15T15:56:46.599766Z", + "iopub.status.idle": "2024-05-15T15:56:47.112399Z", + "shell.execute_reply": "2024-05-15T15:56:47.111220Z" } }, "outputs": [], "source": [ + "import splink.comparison_level_library as cll\n", "import splink.comparison_library as cl\n", "import splink.comparison_template_library as ctl\n", - "import splink.comparison_level_library as cll\n", "\n", "# the simple model only considers a few columns, and only two comparison levels for each\n", "simple_model_settings = SettingsCreator(\n", @@ -741,10 +750,10 @@ "id": "7ad48419-4eda-4fe5-b00f-2ec9f798e0e8", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:48.591383Z", - "iopub.status.busy": "2024-03-27T15:12:48.591068Z", - "iopub.status.idle": "2024-03-27T15:12:49.136985Z", - "shell.execute_reply": "2024-03-27T15:12:49.136255Z" + "iopub.execute_input": "2024-05-15T15:56:47.118143Z", + "iopub.status.busy": "2024-05-15T15:56:47.117804Z", + "iopub.status.idle": "2024-05-15T15:56:47.491169Z", + "shell.execute_reply": "2024-05-15T15:56:47.489974Z" } }, "outputs": [ @@ -790,10 +799,10 @@ "id": "e40ee288-0c42-4cda-aaf1-3ffb2ea02383", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:49.141055Z", - "iopub.status.busy": "2024-03-27T15:12:49.140723Z", - "iopub.status.idle": "2024-03-27T15:12:56.032430Z", - "shell.execute_reply": "2024-03-27T15:12:56.030840Z" + "iopub.execute_input": "2024-05-15T15:56:47.497349Z", + "iopub.status.busy": "2024-05-15T15:56:47.496965Z", + "iopub.status.idle": "2024-05-15T15:56:59.095072Z", + "shell.execute_reply": "2024-05-15T15:56:59.094337Z" } }, "outputs": [ @@ -804,20 +813,6 @@ "----- Estimating u probabilities using random sampling -----\n" ] }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "cda32ebef57a4bcb9b8d2d531ac2b32a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stderr", "output_type": "stream", @@ -884,10 +879,10 @@ "id": "9ee0f49b-084c-45aa-8c6b-ec5da11c2cc4", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:56.038240Z", - 
"iopub.status.busy": "2024-03-27T15:12:56.037898Z", - "iopub.status.idle": "2024-03-27T15:12:57.769556Z", - "shell.execute_reply": "2024-03-27T15:12:57.768213Z" + "iopub.execute_input": "2024-05-15T15:56:59.100504Z", + "iopub.status.busy": "2024-05-15T15:56:59.100174Z", + "iopub.status.idle": "2024-05-15T15:57:01.059609Z", + "shell.execute_reply": "2024-05-15T15:57:01.058521Z" } }, "outputs": [ @@ -929,21 +924,21 @@ "name": "stderr", "output_type": "stream", "text": [ - "Iteration 1: Largest change in params was -0.294 in probability_two_random_records_match\n" + "Iteration 1: Largest change in params was -0.312 in probability_two_random_records_match\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 2: Largest change in params was 0.00377 in the m_probability of given_name, level `All other comparisons`\n" + "Iteration 2: Largest change in params was 0.00363 in the m_probability of given_name, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 3: Largest change in params was 9.07e-05 in the m_probability of soc_sec_id, level `All other comparisons`\n" + "Iteration 3: Largest change in params was 8.66e-05 in the m_probability of soc_sec_id, level `All other comparisons`\n" ] }, { @@ -1035,14 +1030,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Iteration 2: Largest change in params was 0.000496 in the m_probability of date_of_birth, level `All other comparisons`\n" + "Iteration 2: Largest change in params was 0.000462 in the m_probability of date_of_birth, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 3: Largest change in params was 9.69e-06 in the m_probability of soc_sec_id, level `All other comparisons`\n" + "Iteration 3: Largest change in params was 8.02e-06 in the m_probability of soc_sec_id, level `All other comparisons`\n" ] }, { @@ -1107,10 +1102,10 @@ "id": "31ef6844-6be8-4f01-9ff7-5dfebcf12ae1", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:57.776744Z", - "iopub.status.busy": "2024-03-27T15:12:57.776217Z", - "iopub.status.idle": "2024-03-27T15:12:58.140721Z", - "shell.execute_reply": "2024-03-27T15:12:58.139796Z" + "iopub.execute_input": "2024-05-15T15:57:01.065654Z", + "iopub.status.busy": "2024-05-15T15:57:01.065325Z", + "iopub.status.idle": "2024-05-15T15:57:01.389061Z", + "shell.execute_reply": "2024-05-15T15:57:01.388339Z" } }, "outputs": [ @@ -1119,23 +1114,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -1211,10 +1206,10 @@ "id": "8d260a60-a4fa-4c0d-9853-8b8256a24257", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:58.145743Z", - "iopub.status.busy": "2024-03-27T15:12:58.145251Z", - "iopub.status.idle": "2024-03-27T15:12:58.304185Z", - "shell.execute_reply": "2024-03-27T15:12:58.303499Z" + "iopub.execute_input": "2024-05-15T15:57:01.393145Z", + "iopub.status.busy": "2024-05-15T15:57:01.392842Z", + "iopub.status.idle": "2024-05-15T15:57:01.561233Z", + "shell.execute_reply": "2024-05-15T15:57:01.560475Z" } }, "outputs": [ @@ -1223,23 +1218,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -1315,10 +1310,10 @@ "id": "71f2f166-05cd-4038-a289-a053a1f0b5c5", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:12:58.308373Z", - "iopub.status.busy": "2024-03-27T15:12:58.308067Z", - "iopub.status.idle": "2024-03-27T15:13:01.539459Z", - "shell.execute_reply": "2024-03-27T15:13:01.538548Z" + "iopub.execute_input": "2024-05-15T15:57:01.565611Z", + "iopub.status.busy": "2024-05-15T15:57:01.565220Z", + "iopub.status.idle": "2024-05-15T15:57:04.177024Z", + "shell.execute_reply": "2024-05-15T15:57:04.176371Z" } }, "outputs": [ @@ -1391,168 +1386,175 @@ "name": "stderr", "output_type": "stream", "text": [ - "Iteration 1: Largest change in params was -0.0811 in the m_probability of surname, level `Exact match on surname`\n" + "Iteration 1: Largest change in params was 0.0821 in the m_probability of surname, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 2: Largest change in params was -0.0258 in the m_probability of surname, level `Exact match on surname`\n" + "Iteration 2: Largest change in params was -0.0237 in the m_probability of surname, level `Exact match on surname`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 3: Largest change in params was 0.0243 in the m_probability of surname, level `All other comparisons`\n" + "Iteration 3: Largest change in params was 0.0222 in the m_probability of surname, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 4: Largest change in params was -0.0224 in the m_probability of surname, level `Exact match on surname`\n" + "Iteration 4: Largest change in params was -0.0205 in the m_probability of surname, level `Exact match on surname`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 5: Largest change in params was 0.0195 in the m_probability of surname, level `All other comparisons`\n" + "Iteration 5: Largest change in params was 0.018 in the m_probability of surname, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 6: Largest change in params was -0.0162 in the m_probability of surname, level `Exact match on surname`\n" + "Iteration 6: Largest change in params was 0.0151 in the m_probability of surname, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 7: Largest change in params was -0.0129 in the m_probability of surname, level `Exact match on surname`\n" + "Iteration 7: Largest change in params was -0.0123 in the m_probability of surname, level `Exact match on surname`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 8: Largest change in params was 0.01 in the m_probability of surname, level `All other comparisons`\n" + "Iteration 8: Largest change in params was -0.0097 in the m_probability of surname, level `Exact match on surname`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 9: Largest change in params was -0.00764 in the m_probability of surname, level `Exact match on surname`\n" + "Iteration 9: Largest change in params was 0.00751 in the m_probability of surname, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 10: Largest change in params was -0.00574 in the m_probability of surname, level `Exact match on surname`\n" + "Iteration 10: Largest change in params was 0.00573 in the m_probability of surname, 
level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 11: Largest change in params was 0.00427 in the m_probability of surname, level `All other comparisons`\n" + "Iteration 11: Largest change in params was 0.00434 in the m_probability of surname, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 12: Largest change in params was -0.00317 in the m_probability of surname, level `Exact match on surname`\n" + "Iteration 12: Largest change in params was -0.00326 in the m_probability of surname, level `Exact match on surname`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 13: Largest change in params was -0.00234 in the m_probability of surname, level `Exact match on surname`\n" + "Iteration 13: Largest change in params was 0.00245 in the m_probability of surname, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 14: Largest change in params was -0.00173 in the m_probability of surname, level `Exact match on surname`\n" + "Iteration 14: Largest change in params was -0.00183 in the m_probability of surname, level `Exact match on surname`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 15: Largest change in params was 0.00128 in the m_probability of surname, level `All other comparisons`\n" + "Iteration 15: Largest change in params was 0.00137 in the m_probability of surname, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 16: Largest change in params was -0.000948 in the m_probability of surname, level `Exact match on surname`\n" + "Iteration 16: Largest change in params was -0.00103 in the m_probability of surname, level `Exact match on surname`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 17: Largest change in params was 0.000703 in the m_probability of surname, level `All other comparisons`\n" + "Iteration 17: Largest change in params was -0.000769 in the m_probability of surname, level `Exact match on surname`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 18: Largest change in params was -0.000521 in the m_probability of surname, level `Exact match on surname`\n" + "Iteration 18: Largest change in params was -0.000576 in the m_probability of surname, level `Exact match on surname`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 19: Largest change in params was -0.000387 in the m_probability of surname, level `Exact match on surname`\n" + "Iteration 19: Largest change in params was 0.000432 in the m_probability of surname, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 20: Largest change in params was -0.000288 in the m_probability of surname, level `Exact match on surname`\n" + "Iteration 20: Largest change in params was -0.000324 in the m_probability of surname, level `Exact match on surname`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 21: Largest change in params was 0.000214 in the m_probability of surname, level `All other comparisons`\n" + "Iteration 21: Largest change in params was 0.000243 in the m_probability of surname, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 22: Largest change in params was -0.000159 in the m_probability of surname, level `Exact match on surname`\n" + 
"Iteration 22: Largest change in params was -0.000182 in the m_probability of surname, level `Exact match on surname`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 23: Largest change in params was -0.000118 in the m_probability of surname, level `Exact match on surname`\n" + "Iteration 23: Largest change in params was -0.000137 in the m_probability of surname, level `Exact match on surname`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 24: Largest change in params was 8.82e-05 in the m_probability of surname, level `All other comparisons`\n" + "Iteration 24: Largest change in params was -0.000103 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 25: Largest change in params was -7.73e-05 in the m_probability of surname, level `Exact match on surname`\n" ] }, { @@ -1560,7 +1562,7 @@ "output_type": "stream", "text": [ "\n", - "EM converged after 24 iterations\n" + "EM converged after 25 iterations\n" ] }, { @@ -1607,175 +1609,175 @@ "name": "stderr", "output_type": "stream", "text": [ - "Iteration 1: Largest change in params was -0.0461 in the m_probability of surname, level `Exact match on surname`\n" + "Iteration 1: Largest change in params was 0.0513 in the m_probability of surname, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 2: Largest change in params was -0.0276 in the m_probability of surname, level `All other comparisons`\n" + "Iteration 2: Largest change in params was -0.025 in the m_probability of given_name, level `Exact match on given_name`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 3: Largest change in params was -0.0242 in the m_probability of given_name, level `Exact match on given_name`\n" + "Iteration 3: Largest change in params was -0.0251 in the m_probability of given_name, level `Exact match on given_name`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 4: Largest change in params was -0.0236 in the m_probability of given_name, level `Exact match on given_name`\n" + "Iteration 4: Largest change in params was 0.0246 in the m_probability of given_name, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 5: Largest change in params was 0.0217 in the m_probability of given_name, level `All other comparisons`\n" + "Iteration 5: Largest change in params was 0.0228 in the m_probability of given_name, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 6: Largest change in params was 0.019 in the m_probability of given_name, level `All other comparisons`\n" + "Iteration 6: Largest change in params was -0.0199 in the m_probability of given_name, level `Exact match on given_name`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 7: Largest change in params was -0.0159 in the m_probability of given_name, level `Exact match on given_name`\n" + "Iteration 7: Largest change in params was -0.0166 in the m_probability of given_name, level `Exact match on given_name`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 8: Largest change in params was 0.0128 in the m_probability of given_name, level `All other comparisons`\n" + "Iteration 8: Largest change in params was 0.0134 in the m_probability of given_name, level `All other comparisons`\n" ] }, { 
"name": "stderr", "output_type": "stream", "text": [ - "Iteration 9: Largest change in params was -0.0101 in the m_probability of given_name, level `Exact match on given_name`\n" + "Iteration 9: Largest change in params was -0.0105 in the m_probability of given_name, level `Exact match on given_name`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 10: Largest change in params was -0.00774 in the m_probability of given_name, level `Exact match on given_name`\n" + "Iteration 10: Largest change in params was -0.00801 in the m_probability of given_name, level `Exact match on given_name`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 11: Largest change in params was 0.00588 in the m_probability of given_name, level `All other comparisons`\n" + "Iteration 11: Largest change in params was 0.00607 in the m_probability of given_name, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 12: Largest change in params was -0.00443 in the m_probability of given_name, level `Exact match on given_name`\n" + "Iteration 12: Largest change in params was -0.00457 in the m_probability of given_name, level `Exact match on given_name`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 13: Largest change in params was -0.00333 in the m_probability of given_name, level `Exact match on given_name`\n" + "Iteration 13: Largest change in params was 0.00344 in the m_probability of given_name, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 14: Largest change in params was -0.00251 in the m_probability of given_name, level `Exact match on given_name`\n" + "Iteration 14: Largest change in params was 0.0026 in the m_probability of given_name, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 15: Largest change in params was 0.0019 in the m_probability of given_name, level `All other comparisons`\n" + "Iteration 15: Largest change in params was -0.00197 in the m_probability of given_name, level `Exact match on given_name`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 16: Largest change in params was -0.00144 in the m_probability of given_name, level `Exact match on given_name`\n" + "Iteration 16: Largest change in params was 0.0015 in the m_probability of given_name, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 17: Largest change in params was -0.0011 in the m_probability of given_name, level `Exact match on given_name`\n" + "Iteration 17: Largest change in params was -0.00115 in the m_probability of given_name, level `Exact match on given_name`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 18: Largest change in params was 0.000851 in the m_probability of given_name, level `All other comparisons`\n" + "Iteration 18: Largest change in params was 0.00089 in the m_probability of given_name, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 19: Largest change in params was -0.00066 in the m_probability of given_name, level `Exact match on given_name`\n" + "Iteration 19: Largest change in params was -0.000693 in the m_probability of given_name, level `Exact match on given_name`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 20: Largest change in params was 0.000515 in the 
m_probability of given_name, level `All other comparisons`\n" + "Iteration 20: Largest change in params was 0.000542 in the m_probability of given_name, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 21: Largest change in params was 0.000404 in the m_probability of given_name, level `All other comparisons`\n" + "Iteration 21: Largest change in params was -0.000426 in the m_probability of given_name, level `Exact match on given_name`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 22: Largest change in params was -0.000324 in the m_probability of surname, level `Exact match on surname`\n" + "Iteration 22: Largest change in params was -0.000337 in the m_probability of given_name, level `Exact match on given_name`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 23: Largest change in params was 0.000269 in the m_probability of surname, level `All other comparisons`\n" + "Iteration 23: Largest change in params was 0.000274 in the m_probability of surname, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 24: Largest change in params was -0.000222 in the m_probability of surname, level `Exact match on surname`\n" + "Iteration 24: Largest change in params was -0.000224 in the m_probability of surname, level `Exact match on surname`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 25: Largest change in params was -0.000183 in the m_probability of surname, level `Exact match on surname`\n" + "Iteration 25: Largest change in params was 0.000182 in the m_probability of surname, level `All other comparisons`\n" ] }, { @@ -1799,23 +1801,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -1893,10 +1895,10 @@ "id": "3a87cb78-0e97-40a3-b757-6c99bb19d7b1", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:13:01.544092Z", - "iopub.status.busy": "2024-03-27T15:13:01.543746Z", - "iopub.status.idle": "2024-03-27T15:13:01.547981Z", - "shell.execute_reply": "2024-03-27T15:13:01.546972Z" + "iopub.execute_input": "2024-05-15T15:57:04.180496Z", + "iopub.status.busy": "2024-05-15T15:57:04.180247Z", + "iopub.status.idle": "2024-05-15T15:57:04.183145Z", + "shell.execute_reply": "2024-05-15T15:57:04.182523Z" } }, "outputs": [], @@ -1925,10 +1927,10 @@ "id": "b17b131c-c83e-4c32-bfad-c12021d2c3b7", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:13:01.552731Z", - "iopub.status.busy": "2024-03-27T15:13:01.552391Z", - "iopub.status.idle": "2024-03-27T15:13:01.922769Z", - "shell.execute_reply": "2024-03-27T15:13:01.921800Z" + "iopub.execute_input": "2024-05-15T15:57:04.186220Z", + "iopub.status.busy": "2024-05-15T15:57:04.185782Z", + "iopub.status.idle": "2024-05-15T15:57:04.541188Z", + "shell.execute_reply": "2024-05-15T15:57:04.540169Z" } }, "outputs": [ @@ -1937,23 +1939,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -2021,10 +2023,10 @@ "id": "c095ff2b-405b-427c-849f-1468f6ca98e0", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:13:01.928496Z", - "iopub.status.busy": "2024-03-27T15:13:01.927322Z", - "iopub.status.idle": "2024-03-27T15:13:02.357636Z", - "shell.execute_reply": "2024-03-27T15:13:02.356845Z" + "iopub.execute_input": "2024-05-15T15:57:04.545921Z", + "iopub.status.busy": "2024-05-15T15:57:04.545071Z", + "iopub.status.idle": "2024-05-15T15:57:04.888788Z", + "shell.execute_reply": "2024-05-15T15:57:04.887944Z" } }, "outputs": [ @@ -2033,23 +2035,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -2127,10 +2129,10 @@ "id": "26e5dbe5-a621-44ab-bdb4-0bcd53b220b6", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:13:02.362064Z", - "iopub.status.busy": "2024-03-27T15:13:02.361708Z", - "iopub.status.idle": "2024-03-27T15:13:02.614930Z", - "shell.execute_reply": "2024-03-27T15:13:02.614265Z" + "iopub.execute_input": "2024-05-15T15:57:04.893722Z", + "iopub.status.busy": "2024-05-15T15:57:04.893207Z", + "iopub.status.idle": "2024-05-15T15:57:05.067224Z", + "shell.execute_reply": "2024-05-15T15:57:05.066686Z" } }, "outputs": [ @@ -2139,23 +2141,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -2234,10 +2236,10 @@ "id": "149962d6-a2ad-412f-aa05-8697beb12ed0", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:13:02.619029Z", - "iopub.status.busy": "2024-03-27T15:13:02.618710Z", - "iopub.status.idle": "2024-03-27T15:13:04.865112Z", - "shell.execute_reply": "2024-03-27T15:13:04.864392Z" + "iopub.execute_input": "2024-05-15T15:57:05.070283Z", + "iopub.status.busy": "2024-05-15T15:57:05.070040Z", + "iopub.status.idle": "2024-05-15T15:57:06.960773Z", + "shell.execute_reply": "2024-05-15T15:57:06.959848Z" } }, "outputs": [ @@ -2246,23 +2248,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -2330,10 +2332,10 @@ "id": "cac493dd-ea43-4550-8fd4-f758ae90ed75", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:13:04.868863Z", - "iopub.status.busy": "2024-03-27T15:13:04.868574Z", - "iopub.status.idle": "2024-03-27T15:13:05.268356Z", - "shell.execute_reply": "2024-03-27T15:13:05.267646Z" + "iopub.execute_input": "2024-05-15T15:57:06.965159Z", + "iopub.status.busy": "2024-05-15T15:57:06.964863Z", + "iopub.status.idle": "2024-05-15T15:57:07.337075Z", + "shell.execute_reply": "2024-05-15T15:57:07.336337Z" } }, "outputs": [ @@ -2342,23 +2344,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -2841,10 +2829,10 @@ "id": "ade53248-212f-4776-8d7d-4632b1749425", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:13:14.830877Z", - "iopub.status.busy": "2024-03-27T15:13:14.830361Z", - "iopub.status.idle": "2024-03-27T15:13:15.187246Z", - "shell.execute_reply": "2024-03-27T15:13:15.186429Z" + "iopub.execute_input": "2024-05-15T15:57:15.183049Z", + "iopub.status.busy": "2024-05-15T15:57:15.182695Z", + "iopub.status.idle": "2024-05-15T15:57:15.493444Z", + "shell.execute_reply": "2024-05-15T15:57:15.492713Z" } }, "outputs": [ @@ -2858,8 +2846,8 @@ { "data": { "text/plain": [ - "2 4956\n", - "1 88\n", + "2 4958\n", + "1 84\n", "Name: count, dtype: int64" ] }, @@ -2894,10 +2882,10 @@ "id": "ef77a8b1-1119-4cb0-b299-343a4022d65e", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:13:15.191801Z", - "iopub.status.busy": "2024-03-27T15:13:15.191422Z", - "iopub.status.idle": "2024-03-27T15:13:15.213725Z", - "shell.execute_reply": "2024-03-27T15:13:15.212992Z" + "iopub.execute_input": "2024-05-15T15:57:15.500107Z", + "iopub.status.busy": "2024-05-15T15:57:15.499499Z", + "iopub.status.idle": "2024-05-15T15:57:15.523366Z", + "shell.execute_reply": "2024-05-15T15:57:15.522625Z" } }, "outputs": [], @@ -2919,10 +2907,10 @@ "id": "bc531ca3-fe0d-480d-b059-a7125474fb22", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:13:15.217764Z", - "iopub.status.busy": "2024-03-27T15:13:15.217099Z", - "iopub.status.idle": "2024-03-27T15:13:15.892736Z", - "shell.execute_reply": "2024-03-27T15:13:15.892017Z" + "iopub.execute_input": "2024-05-15T15:57:15.527453Z", + "iopub.status.busy": "2024-05-15T15:57:15.527121Z", + "iopub.status.idle": "2024-05-15T15:57:16.507088Z", + "shell.execute_reply": "2024-05-15T15:57:16.506251Z" } }, "outputs": [ @@ -2931,23 +2919,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -3018,10 +3006,10 @@ "id": "aacd9042-5672-4bc4-aa98-940d1f5fd28a", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:13:15.896990Z", - "iopub.status.busy": "2024-03-27T15:13:15.896660Z", - "iopub.status.idle": "2024-03-27T15:13:16.508040Z", - "shell.execute_reply": "2024-03-27T15:13:16.507092Z" + "iopub.execute_input": "2024-05-15T15:57:16.510992Z", + "iopub.status.busy": "2024-05-15T15:57:16.510681Z", + "iopub.status.idle": "2024-05-15T15:57:17.322254Z", + "shell.execute_reply": "2024-05-15T15:57:17.321456Z" } }, "outputs": [ @@ -3030,23 +3018,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -3137,10 +3125,10 @@ "id": "2a7229da-9f79-4151-a6b1-018d17205f5f", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:13:16.512359Z", - "iopub.status.busy": "2024-03-27T15:13:16.511999Z", - "iopub.status.idle": "2024-03-27T15:13:16.526408Z", - "shell.execute_reply": "2024-03-27T15:13:16.525600Z" + "iopub.execute_input": "2024-05-15T15:57:17.327035Z", + "iopub.status.busy": "2024-05-15T15:57:17.326665Z", + "iopub.status.idle": "2024-05-15T15:57:17.342204Z", + "shell.execute_reply": "2024-05-15T15:57:17.341227Z" } }, "outputs": [], @@ -3243,10 +3231,10 @@ "id": "1581eeeb-246b-46de-be88-ba4dc821fce7", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:13:16.530152Z", - "iopub.status.busy": "2024-03-27T15:13:16.529860Z", - "iopub.status.idle": "2024-03-27T15:14:32.704293Z", - "shell.execute_reply": "2024-03-27T15:14:32.703556Z" + "iopub.execute_input": "2024-05-15T15:57:17.346493Z", + "iopub.status.busy": "2024-05-15T15:57:17.346091Z", + "iopub.status.idle": "2024-05-15T15:58:52.238122Z", + "shell.execute_reply": "2024-05-15T15:58:52.237374Z" } }, "outputs": [ @@ -3265,20 +3253,6 @@ "----- Estimating u probabilities using random sampling -----\n" ] }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "886192be2bdf4a88a1d8808f1db44fb2", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stderr", "output_type": "stream", @@ -3341,10 +3315,10 @@ "id": "265f0651", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:14:32.708880Z", - "iopub.status.busy": "2024-03-27T15:14:32.708590Z", - "iopub.status.idle": "2024-03-27T15:14:33.471788Z", - "shell.execute_reply": "2024-03-27T15:14:33.471075Z" + "iopub.execute_input": "2024-05-15T15:58:52.244579Z", + "iopub.status.busy": "2024-05-15T15:58:52.244307Z", + "iopub.status.idle": "2024-05-15T15:58:53.189566Z", + "shell.execute_reply": "2024-05-15T15:58:53.188815Z" } }, "outputs": [ @@ -3408,7 +3382,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Iteration 3: Largest change in params was 4.95e-05 in the m_probability of Social security ID, level `All other comparisons`\n" + "Iteration 3: Largest change in params was 4.91e-05 in the m_probability of Social security ID, level `All other comparisons`\n" ] }, { @@ -3449,10 +3423,10 @@ "id": "ebcb15c8", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:14:33.475903Z", - "iopub.status.busy": "2024-03-27T15:14:33.475574Z", - "iopub.status.idle": "2024-03-27T15:14:34.945840Z", - "shell.execute_reply": "2024-03-27T15:14:34.944366Z" + "iopub.execute_input": "2024-05-15T15:58:53.193304Z", + "iopub.status.busy": "2024-05-15T15:58:53.193012Z", + "iopub.status.idle": "2024-05-15T15:58:54.287492Z", + "shell.execute_reply": "2024-05-15T15:58:54.286732Z" } }, "outputs": [ @@ -3536,14 +3510,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Iteration 2: Largest change in params was 0.000647 in the m_probability of date_of_birth, level `All other comparisons`\n" + "Iteration 2: Largest change in params was 0.000635 in the m_probability of date_of_birth, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 3: Largest change in params was 1.72e-05 in the m_probability of Social security ID, level `All other comparisons`\n" + "Iteration 3: 
Largest change in params was 1.68e-05 in the m_probability of Social security ID, level `All other comparisons`\n" ] }, { @@ -3605,10 +3579,10 @@ "id": "d9d21e85-b89b-435a-8b75-142166ac3f31", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:14:34.954963Z", - "iopub.status.busy": "2024-03-27T15:14:34.954576Z", - "iopub.status.idle": "2024-03-27T15:14:35.217839Z", - "shell.execute_reply": "2024-03-27T15:14:35.216566Z" + "iopub.execute_input": "2024-05-15T15:58:54.292571Z", + "iopub.status.busy": "2024-05-15T15:58:54.292308Z", + "iopub.status.idle": "2024-05-15T15:58:54.443712Z", + "shell.execute_reply": "2024-05-15T15:58:54.443023Z" } }, "outputs": [ @@ -3617,23 +3591,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -3701,10 +3675,10 @@ "id": "4a857c18-b0d5-48dc-b7f1-1f6389db5089", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:14:35.224083Z", - "iopub.status.busy": "2024-03-27T15:14:35.223599Z", - "iopub.status.idle": "2024-03-27T15:14:35.613965Z", - "shell.execute_reply": "2024-03-27T15:14:35.613373Z" + "iopub.execute_input": "2024-05-15T15:58:54.447134Z", + "iopub.status.busy": "2024-05-15T15:58:54.446857Z", + "iopub.status.idle": "2024-05-15T15:58:54.770678Z", + "shell.execute_reply": "2024-05-15T15:58:54.770024Z" } }, "outputs": [ @@ -3713,23 +3687,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -3797,10 +3771,10 @@ "id": "e1ee24d9-1def-4b8d-bb85-1c63b595e75e", "metadata": { "execution": { - "iopub.execute_input": "2024-03-27T15:14:35.616799Z", - "iopub.status.busy": "2024-03-27T15:14:35.616580Z", - "iopub.status.idle": "2024-03-27T15:14:38.723572Z", - "shell.execute_reply": "2024-03-27T15:14:38.722758Z" + "iopub.execute_input": "2024-05-15T15:58:54.773893Z", + "iopub.status.busy": "2024-05-15T15:58:54.773655Z", + "iopub.status.idle": "2024-05-15T15:58:56.607253Z", + "shell.execute_reply": "2024-05-15T15:58:56.606584Z" } }, "outputs": [ From f393bd5a99fc0c87289a3be0a79220ceb2e607fa Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 19:48:10 +0100 Subject: [PATCH 46/59] fix more notebooks --- .../duckdb/deduplicate_50k_synthetic.ipynb | 3 +- docs/demos/examples/duckdb/transactions.ipynb | 3345 ++++++++--------- .../sqlite/deduplicate_50k_synthetic.ipynb | 2982 ++++++++------- 3 files changed, 3132 insertions(+), 3198 deletions(-) diff --git a/docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb b/docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb index 3ba8fa66e3..2196162f3a 100644 --- a/docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb +++ b/docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb @@ -1409,6 +1409,8 @@ } ], "source": [ + "from IPython.display import IFrame\n", + "\n", "linker.cluster_studio_dashboard(\n", " df_predict,\n", " clusters,\n", @@ -1417,7 +1419,6 @@ " overwrite=True,\n", ")\n", "\n", - "from IPython.display import IFrame\n", "\n", "IFrame(src=\"./dashboards/50k_cluster.html\", width=\"100%\", height=1200)" ] diff --git a/docs/demos/examples/duckdb/transactions.ipynb b/docs/demos/examples/duckdb/transactions.ipynb index c4d15729b0..6207cc33ca 100644 --- a/docs/demos/examples/duckdb/transactions.ipynb +++ b/docs/demos/examples/duckdb/transactions.ipynb @@ -1,1739 +1,1696 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Linking banking transactions\n", - "\n", - "This example shows how to perform a one-to-one link on banking transactions.\n", - "\n", - "The data is fake data, and was generated has the following features:\n", - "\n", - "- Money shows up in the destination account with some time delay\n", - "- The amount sent and the amount received are not always the same - there are hidden fees and foreign exchange effects\n", - "- The memo is sometimes truncated and content is sometimes missing\n", - "\n", - "Since each origin payment should end up in the destination account, the `probability_two_random_records_match` of the model is known.\n" + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Linking banking transactions\n", + "\n", + "This example shows how to perform a one-to-one link on banking transactions.\n", + "\n", + "The data is fake data, and was generated has the following features:\n", + "\n", + "- Money shows up in the destination account with some time delay\n", + "- The amount sent and the amount received are not always the same - there are hidden fees and foreign exchange effects\n", + "- The memo is sometimes truncated and content is sometimes missing\n", + "\n", + "Since each origin payment should end up in the destination account, the `probability_two_random_records_match` of the model is known.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + " \"Open\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 
1, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:47:03.769001Z", + "iopub.status.busy": "2024-05-15T18:47:03.768667Z", + "iopub.status.idle": "2024-05-15T18:47:03.790143Z", + "shell.execute_reply": "2024-05-15T18:47:03.789060Z" + } + }, + "outputs": [], + "source": [ + "# Uncomment and run this cell if you're running in Google Colab.\n", + "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:47:03.794648Z", + "iopub.status.busy": "2024-05-15T18:47:03.794219Z", + "iopub.status.idle": "2024-05-15T18:47:05.667707Z", + "shell.execute_reply": "2024-05-15T18:47:05.666879Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
      | ground_truth | memo            | transaction_date | amount | unique_id
    0 | 0            | MATTHIAS C paym | 2022-03-28       | 36.36  | 0
    1 | 1            | M CORVINUS dona | 2022-02-14       | 221.91 | 1
\n", + "
" + ], + "text/plain": [ + " ground_truth memo transaction_date amount unique_id\n", + "0 0 MATTHIAS C paym 2022-03-28 36.36 0\n", + "1 1 M CORVINUS dona 2022-02-14 221.91 1" ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - " \"Open\n", - "\n" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
      | ground_truth | memo                   | transaction_date | amount | unique_id
    0 | 0            | MATTHIAS C payment BGC | 2022-03-29       | 36.36  | 0
    1 | 1            | M CORVINUS BGC         | 2022-02-16       | 221.91 | 1
\n", + "
" + ], + "text/plain": [ + " ground_truth memo transaction_date amount unique_id\n", + "0 0 MATTHIAS C payment BGC 2022-03-29 36.36 0\n", + "1 1 M CORVINUS BGC 2022-02-16 221.91 1" ] - }, + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets\n", + "\n", + "df_origin = splink_datasets.transactions_origin\n", + "df_destination = splink_datasets.transactions_destination\n", + "\n", + "display(df_origin.head(2))\n", + "display(df_destination.head(2))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the following chart, we can see this is a challenging dataset to link:\n", + "\n", + "- There are only 151 distinct transaction dates, with strong skew\n", + "- Some 'memos' are used multiple times (up to 48 times)\n", + "- There is strong skew in the 'amount' column, with 1,400 transactions of around 60.00\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:47:05.710715Z", + "iopub.status.busy": "2024-05-15T18:47:05.710346Z", + "iopub.status.idle": "2024-05-15T18:47:06.445989Z", + "shell.execute_reply": "2024-05-15T18:47:06.445394Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:15:57.492180Z", - "iopub.status.busy": "2024-03-27T15:15:57.491890Z", - "iopub.status.idle": "2024-03-27T15:15:57.497297Z", - "shell.execute_reply": "2024-03-27T15:15:57.496501Z" - } - }, - "outputs": [], - "source": [ - "# Uncomment and run this cell if you're running in Google Colab.\n", - "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev" + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" ] - }, + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink.exploratory import profile_columns\n", + "\n", + "db_api = DuckDBAPI()\n", + "profile_columns(\n", + " [df_origin, df_destination],\n", + " db_api=db_api,\n", + " column_expressions=[\n", + " \"memo\",\n", + " \"transaction_date\",\n", + " \"amount\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:47:06.449369Z", + "iopub.status.busy": "2024-05-15T18:47:06.449107Z", + "iopub.status.idle": "2024-05-15T18:47:07.230866Z", + "shell.execute_reply": "2024-05-15T18:47:07.230164Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:15:57.501346Z", - "iopub.status.busy": "2024-03-27T15:15:57.501027Z", - "iopub.status.idle": "2024-03-27T15:15:59.169970Z", - "shell.execute_reply": "2024-03-27T15:15:59.169298Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
      | ground_truth | memo            | transaction_date | amount | unique_id
    0 | 0            | MATTHIAS C paym | 2022-03-28       | 36.36  | 0
    1 | 1            | M CORVINUS dona | 2022-02-14       | 221.91 | 1
\n", - "
" - ], - "text/plain": [ - " ground_truth memo transaction_date amount unique_id\n", - "0 0 MATTHIAS C paym 2022-03-28 36.36 0\n", - "1 1 M CORVINUS dona 2022-02-14 221.91 1" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
      | ground_truth | memo                   | transaction_date | amount | unique_id
    0 | 0            | MATTHIAS C payment BGC | 2022-03-29       | 36.36  | 0
    1 | 1            | M CORVINUS BGC         | 2022-02-16       | 221.91 | 1
\n", - "
" - ], - "text/plain": [ - " ground_truth memo transaction_date amount unique_id\n", - "0 0 MATTHIAS C payment BGC 2022-03-29 36.36 0\n", - "1 1 M CORVINUS BGC 2022-02-16 221.91 1" - ] - }, - "metadata": {}, - "output_type": "display_data" - } + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" ], - "source": [ - "from splink import Linker, DuckDBAPI, block_on, SettingsCreator, splink_datasets\n", - "\n", - "df_origin = splink_datasets.transactions_origin\n", - "df_destination = splink_datasets.transactions_destination\n", - "\n", - "display(df_origin.head(2))\n", - "display(df_destination.head(2))" + "text/plain": [ + "alt.Chart(...)" ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink import DuckDBAPI, block_on\n", + "from splink.blocking_analysis import (\n", + " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n", + ")\n", + "\n", + "# Design blocking rules that allow for differences in transaction date and amounts\n", + "blocking_rule_date_1 = \"\"\"\n", + " strftime(l.transaction_date, '%Y%m') = strftime(r.transaction_date, '%Y%m')\n", + " and substr(l.memo, 1,3) = substr(r.memo,1,3)\n", + " and l.amount/r.amount > 0.7 and l.amount/r.amount < 1.3\n", + "\"\"\"\n", + "\n", + "# Offset by half a month to ensure we capture case when the dates are e.g. 31st Jan and 1st Feb\n", + "blocking_rule_date_2 = \"\"\"\n", + " strftime(l.transaction_date+15, '%Y%m') = strftime(r.transaction_date, '%Y%m')\n", + " and substr(l.memo, 1,3) = substr(r.memo,1,3)\n", + " and l.amount/r.amount > 0.7 and l.amount/r.amount < 1.3\n", + "\"\"\"\n", + "\n", + "blocking_rule_memo = block_on(\"substr(memo,1,9)\")\n", + "\n", + "blocking_rule_amount_1 = \"\"\"\n", + "round(l.amount/2,0)*2 = round(r.amount/2,0)*2 and yearweek(r.transaction_date) = yearweek(l.transaction_date)\n", + "\"\"\"\n", + "\n", + "blocking_rule_amount_2 = \"\"\"\n", + "round(l.amount/2,0)*2 = round((r.amount+1)/2,0)*2 and yearweek(r.transaction_date) = yearweek(l.transaction_date + 4)\n", + "\"\"\"\n", + "\n", + "blocking_rule_cheat = block_on(\"unique_id\")\n", + "\n", + "\n", + "brs = [\n", + " blocking_rule_date_1,\n", + " blocking_rule_date_2,\n", + " blocking_rule_memo,\n", + " blocking_rule_amount_1,\n", + " blocking_rule_amount_2,\n", + " blocking_rule_cheat,\n", + "]\n", + "\n", + "\n", + "db_api = DuckDBAPI()\n", + "\n", + "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", + " table_or_tables=[df_origin, df_destination],\n", + " blocking_rule_creators=brs,\n", + " db_api=db_api,\n", + " link_type=\"link_only\",\n", + " unique_id_column_name=\"unique_id\",\n", + " source_dataset_column_name=\"source_dataset\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:47:07.234575Z", + "iopub.status.busy": "2024-05-15T18:47:07.234247Z", + "iopub.status.idle": "2024-05-15T18:47:07.243390Z", + "shell.execute_reply": "2024-05-15T18:47:07.242498Z" + } + }, + "outputs": [], + "source": [ + "# Full settings for linking model\n", + "import splink.comparison_level_library as cll\n", + "import splink.comparison_library as cl\n", + "\n", + "comparison_amount = {\n", + " \"output_column_name\": \"amount\",\n", + " \"comparison_levels\": [\n", + " cll.NullLevel(\"amount\"),\n", + " cll.ExactMatchLevel(\"amount\"),\n", + " cll.PercentageDifferenceLevel(\"amount\", 0.01),\n", + " cll.PercentageDifferenceLevel(\"amount\", 0.03),\n", + " cll.PercentageDifferenceLevel(\"amount\", 0.1),\n", + " cll.PercentageDifferenceLevel(\"amount\", 0.3),\n", + " cll.ElseLevel(),\n", + " ],\n", + " \"comparison_description\": \"Amount percentage difference\",\n", + "}\n", + "\n", + "# The date distance is one sided becaause transactions should only arrive after 
they've left\n", + "# As a result, the comparison_template_library date difference functions are not appropriate\n", + "within_n_days_template = \"transaction_date_r - transaction_date_l <= {n} and transaction_date_r >= transaction_date_l\"\n", + "\n", + "comparison_date = {\n", + " \"output_column_name\": \"transaction_date\",\n", + " \"comparison_levels\": [\n", + " cll.NullLevel(\"transaction_date\"),\n", + " {\n", + " \"sql_condition\": within_n_days_template.format(n=1),\n", + " \"label_for_charts\": \"1 day\",\n", + " },\n", + " {\n", + " \"sql_condition\": within_n_days_template.format(n=4),\n", + " \"label_for_charts\": \"<=4 days\",\n", + " },\n", + " {\n", + " \"sql_condition\": within_n_days_template.format(n=10),\n", + " \"label_for_charts\": \"<=10 days\",\n", + " },\n", + " {\n", + " \"sql_condition\": within_n_days_template.format(n=30),\n", + " \"label_for_charts\": \"<=30 days\",\n", + " },\n", + " cll.ElseLevel(),\n", + " ],\n", + " \"comparison_description\": \"Transaction date days apart\",\n", + "}\n", + "\n", + "\n", + "settings = SettingsCreator(\n", + " link_type=\"link_only\",\n", + " probability_two_random_records_match=1 / len(df_origin),\n", + " blocking_rules_to_generate_predictions=[\n", + " blocking_rule_date_1,\n", + " blocking_rule_date_2,\n", + " blocking_rule_memo,\n", + " blocking_rule_amount_1,\n", + " blocking_rule_amount_2,\n", + " blocking_rule_cheat,\n", + " ],\n", + " comparisons=[\n", + " comparison_amount,\n", + " cl.LevenshteinAtThresholds(\"memo\", [2, 6, 10]),\n", + " comparison_date,\n", + " ],\n", + " retain_intermediate_calculation_columns=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:47:07.247075Z", + "iopub.status.busy": "2024-05-15T18:47:07.246785Z", + "iopub.status.idle": "2024-05-15T18:47:07.377423Z", + "shell.execute_reply": "2024-05-15T18:47:07.376705Z" + } + }, + "outputs": [], + "source": [ + "linker = Linker(\n", + " [df_origin, df_destination],\n", + " settings,\n", + " input_table_aliases=[\"__ori\", \"_dest\"],\n", + " database_api=db_api,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:47:07.381612Z", + "iopub.status.busy": "2024-05-15T18:47:07.381368Z", + "iopub.status.idle": "2024-05-15T18:47:09.028314Z", + "shell.execute_reply": "2024-05-15T18:47:09.027785Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "----- Estimating u probabilities using random sampling -----\n" + ] }, { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the following chart, we can see this is a challenging dataset to link:\n", - "\n", - "- There are only 151 distinct transaction dates, with strong skew\n", - "- Some 'memos' are used multiple times (up to 48 times)\n", - "- There is strong skew in the 'amount' column, with 1,400 transactions of around 60.00\n" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Estimated u probabilities using random sampling\n" + ] }, { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:15:59.215569Z", - "iopub.status.busy": "2024-03-27T15:15:59.215138Z", - "iopub.status.idle": "2024-03-27T15:16:00.323285Z", - "shell.execute_reply": "2024-03-27T15:16:00.322600Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.VConcatChart(...)" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from splink.exploratory import profile_columns\n", - "from splink.column_expression import ColumnExpression\n", - "\n", - "db_api = DuckDBAPI()\n", - "profile_columns(\n", - " [df_origin, df_destination],\n", - " db_api=db_api,\n", - " column_expressions=[\n", - " \"memo\",\n", - " \"transaction_date\",\n", - " \"amount\",\n", - " ],\n", - ")" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. Missing estimates for:\n", + " - amount (no m values are trained).\n", + " - memo (no m values are trained).\n", + " - transaction_date (no m values are trained).\n" + ] + } + ], + "source": [ + "linker.estimate_u_using_random_sampling(max_pairs=1e6)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:47:09.031158Z", + "iopub.status.busy": "2024-05-15T18:47:09.030941Z", + "iopub.status.idle": "2024-05-15T18:47:10.248208Z", + "shell.execute_reply": "2024-05-15T18:47:10.247537Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] }, { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:16:00.328168Z", - "iopub.status.busy": "2024-03-27T15:16:00.327829Z", - "iopub.status.idle": "2024-03-27T15:16:01.939332Z", - "shell.execute_reply": "2024-03-27T15:16:01.938682Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Design blocking rules that allow for differences in transaction date and amounts\n", - "from splink.blocking_rule_creator import BlockingRuleCreator\n", - "from splink.settings_creator import SettingsCreator\n", - "\n", - "blocking_rule_date_1 = \"\"\"\n", - " strftime(l.transaction_date, '%Y%m') = strftime(r.transaction_date, '%Y%m')\n", - " and substr(l.memo, 1,3) = substr(r.memo,1,3)\n", - " and l.amount/r.amount > 0.7 and l.amount/r.amount < 1.3\n", - "\"\"\"\n", - "\n", - "# Offset by half a month to ensure we capture case when the dates are e.g. 31st Jan and 1st Feb\n", - "blocking_rule_date_2 = \"\"\"\n", - " strftime(l.transaction_date+15, '%Y%m') = strftime(r.transaction_date, '%Y%m')\n", - " and substr(l.memo, 1,3) = substr(r.memo,1,3)\n", - " and l.amount/r.amount > 0.7 and l.amount/r.amount < 1.3\n", - "\"\"\"\n", - "\n", - "blocking_rule_memo = block_on(\"substr(memo,1,9)\")\n", - "\n", - "blocking_rule_amount_1 = \"\"\"\n", - "round(l.amount/2,0)*2 = round(r.amount/2,0)*2 and yearweek(r.transaction_date) = yearweek(l.transaction_date)\n", - "\"\"\"\n", - "\n", - "blocking_rule_amount_2 = \"\"\"\n", - "round(l.amount/2,0)*2 = round((r.amount+1)/2,0)*2 and yearweek(r.transaction_date) = yearweek(l.transaction_date + 4)\n", - "\"\"\"\n", - "\n", - "blocking_rule_cheat = block_on(\"unique_id\")\n", - "\n", - "# TODO: This analysis should be possible without a linker\n", - "settings = SettingsCreator(link_type=\"link_only\")\n", - "\n", - "linker = Linker([df_origin, df_destination], settings, database_api=db_api)\n", - "\n", - "brs = [\n", - " blocking_rule_date_1,\n", - " blocking_rule_date_2,\n", - " blocking_rule_memo,\n", - " blocking_rule_amount_1,\n", - " blocking_rule_amount_2,\n", - " blocking_rule_cheat,\n", - "]\n", - "\n", - "linker.cumulative_num_comparisons_from_blocking_rules_chart(brs)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.\"memo\" = r.\"memo\"\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - amount\n", + " - transaction_date\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - memo\n" + ] }, { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:16:01.943001Z", - "iopub.status.busy": "2024-03-27T15:16:01.942688Z", - "iopub.status.idle": "2024-03-27T15:16:01.950627Z", - "shell.execute_reply": "2024-03-27T15:16:01.949949Z" - } - }, - "outputs": [], - "source": [ - "# Full settings for linking model\n", - "import splink.comparison_library as cl\n", - "import splink.comparison_level_library as cll\n", - "\n", - "comparison_amount = {\n", - " \"output_column_name\": \"amount\",\n", - " \"comparison_levels\": [\n", - " cll.NullLevel(\"amount\"),\n", - " cll.ExactMatchLevel(\"amount\"),\n", - " cll.PercentageDifferenceLevel(\"amount\", 0.01),\n", - " cll.PercentageDifferenceLevel(\"amount\", 0.03),\n", - " cll.PercentageDifferenceLevel(\"amount\", 0.1),\n", - " cll.PercentageDifferenceLevel(\"amount\", 0.3),\n", - " cll.ElseLevel(),\n", - " ],\n", - " \"comparison_description\": \"Amount percentage difference\",\n", - "}\n", - "\n", - "# The date distance is one sided becaause transactions should only arrive after they've 
left\n", - "# As a result, the comparison_template_library date difference functions are not appropriate\n", - "within_n_days_template = \"transaction_date_r - transaction_date_l <= {n} and transaction_date_r >= transaction_date_l\"\n", - "\n", - "comparison_date = {\n", - " \"output_column_name\": \"transaction_date\",\n", - " \"comparison_levels\": [\n", - " cll.NullLevel(\"transaction_date\"),\n", - " {\n", - " \"sql_condition\": within_n_days_template.format(n=1),\n", - " \"label_for_charts\": \"1 day\",\n", - " },\n", - " {\n", - " \"sql_condition\": within_n_days_template.format(n=4),\n", - " \"label_for_charts\": \"<=4 days\",\n", - " },\n", - " {\n", - " \"sql_condition\": within_n_days_template.format(n=10),\n", - " \"label_for_charts\": \"<=10 days\",\n", - " },\n", - " {\n", - " \"sql_condition\": within_n_days_template.format(n=30),\n", - " \"label_for_charts\": \"<=30 days\",\n", - " },\n", - " cll.ElseLevel(),\n", - " ],\n", - " \"comparison_description\": \"Transaction date days apart\",\n", - "}\n", - "\n", - "\n", - "settings = SettingsCreator(\n", - " link_type=\"link_only\",\n", - " probability_two_random_records_match=1 / len(df_origin),\n", - " blocking_rules_to_generate_predictions=[\n", - " blocking_rule_date_1,\n", - " blocking_rule_date_2,\n", - " blocking_rule_memo,\n", - " blocking_rule_amount_1,\n", - " blocking_rule_amount_2,\n", - " blocking_rule_cheat,\n", - " ],\n", - " comparisons=[\n", - " comparison_amount,\n", - " cl.LevenshteinAtThresholds(\"memo\", [2, 6, 10]),\n", - " comparison_date,\n", - " ],\n", - " retain_intermediate_calculation_columns=True,\n", - ")" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] }, { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:16:01.953886Z", - "iopub.status.busy": "2024-03-27T15:16:01.953634Z", - "iopub.status.idle": "2024-03-27T15:16:02.031109Z", - "shell.execute_reply": "2024-03-27T15:16:02.030145Z" - } - }, - "outputs": [], - "source": [ - "linker = Linker(\n", - " [df_origin, df_destination],\n", - " settings,\n", - " input_table_aliases=[\"__ori\", \"_dest\"],\n", - " database_api=db_api,\n", - ")" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was -0.575 in the m_probability of amount, level `Exact match on amount`\n" + ] }, { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:16:02.035274Z", - "iopub.status.busy": "2024-03-27T15:16:02.034958Z", - "iopub.status.idle": "2024-03-27T15:16:04.885459Z", - "shell.execute_reply": "2024-03-27T15:16:04.884481Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "----- Estimating u probabilities using random sampling -----\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "0cb4a943a08a42c7841ca32d466f9eed", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Estimated u probabilities using random sampling\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Your model is not yet fully trained. 
Missing estimates for:\n", - " - amount (no m values are trained).\n", - " - memo (no m values are trained).\n", - " - transaction_date (no m values are trained).\n" - ] - } - ], - "source": [ - "linker.estimate_u_using_random_sampling(max_pairs=1e6)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was -0.19 in the m_probability of transaction_date, level `1 day`\n" + ] }, { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:16:04.889784Z", - "iopub.status.busy": "2024-03-27T15:16:04.889454Z", - "iopub.status.idle": "2024-03-27T15:16:06.403022Z", - "shell.execute_reply": "2024-03-27T15:16:06.402085Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "----- Starting EM training session -----\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Estimating the m probabilities of the model by blocking on:\n", - "l.\"memo\" = r.\"memo\"\n", - "\n", - "Parameter estimates will be made for the following comparison(s):\n", - " - amount\n", - " - transaction_date\n", - "\n", - "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", - " - memo\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 1: Largest change in params was -0.567 in the m_probability of amount, level `Exact match on amount`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 2: Largest change in params was -0.198 in the m_probability of transaction_date, level `1 day`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 3: Largest change in params was 0.0111 in the m_probability of amount, level `Percentage difference of 'amount' within 10.00%`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 4: Largest change in params was 0.00239 in the m_probability of transaction_date, level `<=30 days`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 5: Largest change in params was 0.000414 in the m_probability of transaction_date, level `<=30 days`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 6: Largest change in params was -0.000286 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 7: Largest change in params was -0.000259 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 8: Largest change in params was -0.000235 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 9: Largest change in params was -0.000215 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 10: Largest change in params was -0.000197 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 11: Largest change in params was -0.000181 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - 
"output_type": "stream", - "text": [ - "Iteration 12: Largest change in params was -0.000167 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 13: Largest change in params was -0.000155 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 14: Largest change in params was -0.000144 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 15: Largest change in params was -0.000134 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 16: Largest change in params was -0.000125 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 17: Largest change in params was -0.000117 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 18: Largest change in params was -0.000109 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 19: Largest change in params was -0.000103 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 20: Largest change in params was -9.65e-05 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "EM converged after 20 iterations\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Your model is not yet fully trained. 
Missing estimates for:\n", - " - memo (no m values are trained).\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "linker.estimate_parameters_using_expectation_maximisation(block_on(\"memo\"))" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was 0.0107 in the m_probability of amount, level `Percentage difference of 'amount' within 10.00%`\n" + ] }, { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:16:06.407081Z", - "iopub.status.busy": "2024-03-27T15:16:06.406758Z", - "iopub.status.idle": "2024-03-27T15:16:08.285543Z", - "shell.execute_reply": "2024-03-27T15:16:08.284404Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "----- Starting EM training session -----\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Estimating the m probabilities of the model by blocking on:\n", - "l.\"amount\" = r.\"amount\"\n", - "\n", - "Parameter estimates will be made for the following comparison(s):\n", - " - memo\n", - " - transaction_date\n", - "\n", - "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", - " - amount\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 1: Largest change in params was -0.411 in the m_probability of memo, level `Exact match on memo`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 2: Largest change in params was -0.0783 in the m_probability of memo, level `Exact match on memo`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 3: Largest change in params was 0.0154 in the m_probability of memo, level `Levenshtein distance of memo <= 10`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 4: Largest change in params was -0.00422 in the m_probability of memo, level `Exact match on memo`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 5: Largest change in params was 0.00368 in the m_probability of memo, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 6: Largest change in params was 0.00365 in the m_probability of memo, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 7: Largest change in params was 0.00346 in the m_probability of memo, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 8: Largest change in params was 0.00313 in the m_probability of memo, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 9: Largest change in params was 0.00274 in the m_probability of memo, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 10: Largest change in params was 0.00233 in the m_probability of memo, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 11: Largest change in params was 0.00192 in the m_probability of memo, level `All other 
comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 12: Largest change in params was 0.00156 in the m_probability of memo, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 13: Largest change in params was 0.00124 in the m_probability of memo, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 14: Largest change in params was 0.000974 in the m_probability of memo, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 15: Largest change in params was 0.000757 in the m_probability of memo, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 16: Largest change in params was 0.000583 in the m_probability of memo, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 17: Largest change in params was 0.000447 in the m_probability of memo, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 18: Largest change in params was 0.000341 in the m_probability of memo, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 19: Largest change in params was 0.000259 in the m_probability of memo, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 20: Largest change in params was 0.000196 in the m_probability of memo, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 21: Largest change in params was 0.000148 in the m_probability of memo, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 22: Largest change in params was 0.000112 in the m_probability of memo, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 23: Largest change in params was 8.44e-05 in the m_probability of memo, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "EM converged after 23 iterations\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n" - ] - } - ], - "source": [ - "session = linker.estimate_parameters_using_expectation_maximisation(block_on(\"amount\"))" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 4: Largest change in params was 0.00232 in the m_probability of transaction_date, level `<=30 days`\n" + ] }, { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:16:08.290188Z", - "iopub.status.busy": "2024-03-27T15:16:08.289615Z", - "iopub.status.idle": "2024-03-27T15:16:08.751526Z", - "shell.execute_reply": "2024-03-27T15:16:08.750832Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.VConcatChart(...)" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "linker.match_weights_chart()" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 5: Largest change in params was 0.000403 in the m_probability of transaction_date, level `<=30 days`\n" + ] }, { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:16:08.755410Z", - "iopub.status.busy": "2024-03-27T15:16:08.755117Z", - "iopub.status.idle": "2024-03-27T15:16:35.663999Z", - "shell.execute_reply": "2024-03-27T15:16:35.662923Z" - } - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4430006dcc174ff092d96adf68c301ff", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "df_predict = linker.predict(threshold_match_probability=0.001)" + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 6: Largest change in params was -0.000267 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 7: Largest change in params was -0.000242 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 8: Largest change in params was -0.00022 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 9: Largest change in params was -0.000202 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 10: Largest change in params was -0.000185 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 11: Largest change in params was -0.000171 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 12: Largest change in params was -0.000158 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 13: Largest change in params was -0.000146 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 14: Largest change in params was -0.000136 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 15: Largest change in params was -0.000127 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 16: Largest change in params was -0.000118 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 17: Largest change in params was -0.000111 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 18: Largest change in params 
was -0.000104 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 19: Largest change in params was -9.76e-05 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 19 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. Missing estimates for:\n", + " - memo (no m values are trained).\n" + ] + }, + { + "data": { + "text/plain": [ + "" ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.estimate_parameters_using_expectation_maximisation(block_on(\"memo\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:47:10.251174Z", + "iopub.status.busy": "2024-05-15T18:47:10.250961Z", + "iopub.status.idle": "2024-05-15T18:47:11.577879Z", + "shell.execute_reply": "2024-05-15T18:47:11.577248Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.\"amount\" = r.\"amount\"\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - memo\n", + " - transaction_date\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - amount\n" + ] }, { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:16:35.668331Z", - "iopub.status.busy": "2024-03-27T15:16:35.668001Z", - "iopub.status.idle": "2024-03-27T15:16:36.066089Z", - "shell.execute_reply": "2024-03-27T15:16:36.065310Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was -0.381 in the m_probability of memo, level `Exact match on memo`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was -0.101 in the m_probability of memo, level `Exact match on memo`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was 0.021 in the m_probability of memo, level `Levenshtein distance of memo <= 10`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 4: Largest change in params was -0.00512 in the m_probability of memo, level `Exact match on memo`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 5: Largest change in params was 0.00421 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 6: Largest change in params was 0.00444 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 7: Largest change in params was 0.00442 in the 
m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 8: Largest change in params was 0.00417 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 9: Largest change in params was 0.00375 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 10: Largest change in params was 0.00323 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 11: Largest change in params was 0.00268 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 12: Largest change in params was 0.00216 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 13: Largest change in params was 0.0017 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 14: Largest change in params was 0.00131 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 15: Largest change in params was 0.001 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 16: Largest change in params was 0.000754 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 17: Largest change in params was 0.000564 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 18: Largest change in params was 0.000419 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 19: Largest change in params was 0.00031 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 20: Largest change in params was 0.000229 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 21: Largest change in params was 0.000168 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 22: Largest change in params was 0.000124 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 23: Largest change in params was 9.08e-05 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 23 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is fully trained. 
All comparisons have at least one estimate for their m and u values\n" + ] + } + ], + "source": [ + "session = linker.estimate_parameters_using_expectation_maximisation(block_on(\"amount\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:47:11.581103Z", + "iopub.status.busy": "2024-05-15T18:47:11.580847Z", + "iopub.status.idle": "2024-05-15T18:47:11.965681Z", + "shell.execute_reply": "2024-05-15T18:47:11.964278Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" ], - "source": [ - "linker.comparison_viewer_dashboard(\n", - " df_predict, \"dashboards/comparison_viewer_transactions.html\", overwrite=True\n", - ")\n", - "from IPython.display import IFrame\n", - "\n", - "IFrame(\n", - " src=\"./dashboards/comparison_viewer_transactions.html\", width=\"100%\", height=1200\n", - ")" + "text/plain": [ + "alt.VConcatChart(...)" ] - }, + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.match_weights_chart()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:47:11.970722Z", + "iopub.status.busy": "2024-05-15T18:47:11.970200Z", + "iopub.status.idle": "2024-05-15T18:47:34.855445Z", + "shell.execute_reply": "2024-05-15T18:47:34.854572Z" + } + }, + "outputs": [], + "source": [ + "df_predict = linker.predict(threshold_match_probability=0.001)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:47:34.859818Z", + "iopub.status.busy": "2024-05-15T18:47:34.859505Z", + "iopub.status.idle": "2024-05-15T18:47:35.227217Z", + "shell.execute_reply": "2024-05-15T18:47:35.226469Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:16:36.070331Z", - "iopub.status.busy": "2024-03-27T15:16:36.070012Z", - "iopub.status.idle": "2024-03-27T15:16:42.791946Z", - "shell.execute_reply": "2024-03-27T15:16:42.791334Z" - } - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "63719efff46e49ecba53edb438f35c3f", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.LayerChart(...)" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "\n", + " \n", + " " ], - "source": [ - "pred_errors = linker.prediction_errors_from_labels_column(\n", - " \"ground_truth\", include_false_positives=True, include_false_negatives=False\n", - ")\n", - "linker.waterfall_chart(pred_errors.as_record_dict(limit=5))" + "text/plain": [ + "" ] - }, + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.comparison_viewer_dashboard(\n", + " df_predict, \"dashboards/comparison_viewer_transactions.html\", overwrite=True\n", + ")\n", + "from IPython.display import IFrame\n", + "\n", + "IFrame(\n", + " src=\"./dashboards/comparison_viewer_transactions.html\", width=\"100%\", height=1200\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:47:35.230739Z", + "iopub.status.busy": "2024-05-15T18:47:35.230449Z", + "iopub.status.idle": "2024-05-15T18:47:41.150757Z", + "shell.execute_reply": "2024-05-15T18:47:41.149898Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-27T15:16:42.795039Z", - "iopub.status.busy": "2024-03-27T15:16:42.794805Z", - "iopub.status.idle": "2024-03-27T15:16:43.942284Z", - "shell.execute_reply": "2024-03-27T15:16:43.941682Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.LayerChart(...)" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" ], - "source": [ - "pred_errors = linker.prediction_errors_from_labels_column(\n", - " \"ground_truth\", include_false_positives=False, include_false_negatives=True\n", - ")\n", - "linker.waterfall_chart(pred_errors.as_record_dict(limit=5))" + "text/plain": [ + "alt.LayerChart(...)" ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "state": { - "0cb4a943a08a42c7841ca32d466f9eed": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_fd157120a2ca488496c737cec882713d", - "max": 100, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_ed234594aea94bf98ffb67a51d3811f4", - "tabbable": null, - "tooltip": null, - "value": 100 - } - }, - "2bae68755fc34e38ac69e792f314ba8e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "bar_color": "black", - "description_width": "" - } - }, - "4430006dcc174ff092d96adf68c301ff": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_5c32bb2a7a714bd79accac15915b17e5", - "max": 100, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_6222247c7cbe45b19cfeb9b182147a18", - "tabbable": null, - "tooltip": null, - "value": 100 - } - }, - "5c32bb2a7a714bd79accac15915b17e5": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - 
"grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "auto" - } - }, - "6222247c7cbe45b19cfeb9b182147a18": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "bar_color": "black", - "description_width": "" - } - }, - "63719efff46e49ecba53edb438f35c3f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_921bb606e07743f7a252c05830098a57", - "max": 100, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_2bae68755fc34e38ac69e792f314ba8e", - "tabbable": null, - "tooltip": null, - "value": 100 - } - }, - "921bb606e07743f7a252c05830098a57": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "auto" - } - }, - "ed234594aea94bf98ffb67a51d3811f4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "bar_color": 
"black", - "description_width": "" - } - }, - "fd157120a2ca488496c737cec882713d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "auto" - } - } - }, - "version_major": 2, - "version_minor": 0 - } + ], + "source": [ + "pred_errors = linker.prediction_errors_from_labels_column(\n", + " \"ground_truth\", include_false_positives=True, include_false_negatives=False\n", + ")\n", + "linker.waterfall_chart(pred_errors.as_record_dict(limit=5))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:47:41.154185Z", + "iopub.status.busy": "2024-05-15T18:47:41.153896Z", + "iopub.status.idle": "2024-05-15T18:47:42.083914Z", + "shell.execute_reply": "2024-05-15T18:47:42.083253Z" } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pred_errors = linker.prediction_errors_from_labels_column(\n", + " \"ground_truth\", include_false_positives=False, include_false_negatives=True\n", + ")\n", + "linker.waterfall_chart(pred_errors.as_record_dict(limit=5))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" }, - "nbformat": 4, - "nbformat_minor": 4 + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": { + "0cb4a943a08a42c7841ca32d466f9eed": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_fd157120a2ca488496c737cec882713d", + "max": 100, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ed234594aea94bf98ffb67a51d3811f4", + "tabbable": null, + "tooltip": null, + "value": 100 + } + }, + "2bae68755fc34e38ac69e792f314ba8e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "bar_color": "black", + "description_width": "" + } + }, + "4430006dcc174ff092d96adf68c301ff": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_5c32bb2a7a714bd79accac15915b17e5", + "max": 100, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_6222247c7cbe45b19cfeb9b182147a18", + "tabbable": null, + "tooltip": null, + "value": 100 + } + }, + "5c32bb2a7a714bd79accac15915b17e5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": 
null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "auto" + } + }, + "6222247c7cbe45b19cfeb9b182147a18": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "bar_color": "black", + "description_width": "" + } + }, + "63719efff46e49ecba53edb438f35c3f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_921bb606e07743f7a252c05830098a57", + "max": 100, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2bae68755fc34e38ac69e792f314ba8e", + "tabbable": null, + "tooltip": null, + "value": 100 + } + }, + "921bb606e07743f7a252c05830098a57": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "auto" + } + }, + "ed234594aea94bf98ffb67a51d3811f4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "2.0.0", + "_view_name": "StyleView", + "bar_color": "black", + "description_width": "" + } + }, + "fd157120a2ca488496c737cec882713d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "auto" + } + } + }, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 } diff --git a/docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb b/docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb index 1329f5c146..fd8e724734 100644 --- a/docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb +++ b/docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb @@ -1,1560 +1,1536 @@ { - "cells": [ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Linking a dataset of real historical persons\n", + "\n", + "In this example, we deduplicate a more realistic dataset. The data is based on historical persons scraped from wikidata. Duplicate records are introduced with a variety of errors introduced.\n", + "\n", + "Note, as explained in the [backends topic guide](https://moj-analytical-services.github.io/splink/topic_guides/backends.html#sqlite), SQLite does not natively support string fuzzy matching functions such as `damareau-levenshtein` and `jaro-winkler` (as used in this example). Instead, these have been imported as python User Defined Functions (UDFs). One drawback of python UDFs is that they are considerably slower than native-SQL comparisons. 
As such, if you are hitting issues with large run times, consider switching to DuckDB (or some other backend).\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + " \"Open\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:30.610213Z", + "iopub.status.busy": "2024-05-15T18:41:30.609846Z", + "iopub.status.idle": "2024-05-15T18:41:30.615335Z", + "shell.execute_reply": "2024-05-15T18:41:30.614566Z" + } + }, + "outputs": [], + "source": [ + "# Uncomment and run this cell if you're running in Google Colab.\n", + "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev\n", + "# !pip install rapidfuzz" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:30.619046Z", + "iopub.status.busy": "2024-05-15T18:41:30.618760Z", + "iopub.status.idle": "2024-05-15T18:41:31.933775Z", + "shell.execute_reply": "2024-05-15T18:41:31.932989Z" + } + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "from splink import splink_datasets\n", + "\n", + "pd.options.display.max_rows = 1000\n", + "# reduce size of dataset to make things run faster\n", + "df = splink_datasets.historical_50k.sample(5000)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:31.938051Z", + "iopub.status.busy": "2024-05-15T18:41:31.937677Z", + "iopub.status.idle": "2024-05-15T18:41:32.856954Z", + "shell.execute_reply": "2024-05-15T18:41:32.856284Z" + } + }, + "outputs": [ { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Linking a dataset of real historical persons\n", - "\n", - "In this example, we deduplicate a more realistic dataset. The data is based on historical persons scraped from wikidata. Duplicate records are introduced with a variety of errors introduced.\n", - "\n", - "Note, as explained in the [backends topic guide](https://moj-analytical-services.github.io/splink/topic_guides/backends.html#sqlite), SQLite does not natively support string fuzzy matching functions such as `damareau-levenshtein` and `jaro-winkler` (as used in this example). Instead, these have been imported as python User Defined Functions (UDFs). One drawback of python UDFs is that they are considerably slower than native-SQL comparisons. As such, if you are hitting issues with large run times, consider switching to DuckDB (or some other backend).\n" + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" ] - }, + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink import SQLiteAPI\n", + "from splink.exploratory import profile_columns\n", + "\n", + "db_api = SQLiteAPI()\n", + "profile_columns(\n", + " df, db_api, column_expressions=[\"first_name\", \"postcode_fake\", \"substr(dob, 1,4)\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:32.900620Z", + "iopub.status.busy": "2024-05-15T18:41:32.900280Z", + "iopub.status.idle": "2024-05-15T18:41:33.193607Z", + "shell.execute_reply": "2024-05-15T18:41:33.192963Z" + } + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - " \"Open\n", - "" + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink import SQLiteAPI, block_on\n", + "from splink.blocking_analysis import (\n", + " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n", + ")\n", + "\n", + "blocking_rules = [block_on(\"first_name\", \"surname\"),\n", + " block_on(\"surname\", \"dob\"),\n", + " block_on(\"first_name\", \"dob\"),\n", + " block_on(\"postcode_fake\", \"first_name\")]\n", + "\n", + "db_api = SQLiteAPI()\n", + "\n", + "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", + " table_or_tables=df,\n", + " blocking_rule_creators=blocking_rules,\n", + " db_api=db_api,\n", + " link_type=\"dedupe_only\",\n", + " unique_id_column_name=\"unique_id\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:33.197015Z", + "iopub.status.busy": "2024-05-15T18:41:33.196743Z", + "iopub.status.idle": "2024-05-15T18:41:33.330331Z", + "shell.execute_reply": "2024-05-15T18:41:33.329671Z" + } + }, + "outputs": [], + "source": [ + "import splink.comparison_library as cl\n", + "import splink.comparison_template_library as ctl\n", + "from splink import Linker\n", + "\n", + "settings = {\n", + " \"link_type\": \"dedupe_only\",\n", + " \"blocking_rules_to_generate_predictions\": [\n", + " block_on(\"first_name\", \"surname\"),\n", + " block_on(\"surname\", \"dob\"),\n", + " block_on(\"first_name\", \"dob\"),\n", + " block_on(\"postcode_fake\", \"first_name\"),\n", + "\n", + " ],\n", + " \"comparisons\": [\n", + " ctl.NameComparison(\"first_name\", fuzzy_thresholds=[0.9]).configure(\n", + " term_frequency_adjustments=True\n", + " ),\n", + " ctl.NameComparison(\"surname\", fuzzy_thresholds=[0.9]).configure(\n", + " term_frequency_adjustments=True\n", + " ),\n", + " cl.DamerauLevenshteinAtThresholds(\"dob\", [1, 2]).configure(\n", + " term_frequency_adjustments=True\n", + " ),\n", + " cl.DamerauLevenshteinAtThresholds(\"postcode_fake\", [1, 2]),\n", + " cl.ExactMatch(\"birth_place\").configure(term_frequency_adjustments=True),\n", + " cl.ExactMatch(\n", + " \"occupation\",\n", + " ).configure(term_frequency_adjustments=True),\n", + " ],\n", + " \"retain_matching_columns\": True,\n", + " \"retain_intermediate_calculation_columns\": True,\n", + " \"max_iterations\": 10,\n", + " \"em_convergence\": 0.01,\n", + "}\n", + "\n", + "linker = Linker(df, settings, database_api=db_api)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:33.334300Z", + "iopub.status.busy": "2024-05-15T18:41:33.333988Z", + "iopub.status.idle": "2024-05-15T18:41:33.488238Z", + "shell.execute_reply": "2024-05-15T18:41:33.487555Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Probability two random records match is estimated to be 0.00013.\n", + "This means that amongst all possible pairwise record comparisons, one in 7,667.18 are expected to match. 
With 12,497,500 total possible comparisons, we expect a total of around 1,630.00 matching pairs\n" + ] + } + ], + "source": [ + "linker.estimate_probability_two_random_records_match(\n", + " [\n", + " \"l.first_name = r.first_name and l.surname = r.surname and l.dob = r.dob\",\n", + " \"substr(l.first_name,1,2) = substr(r.first_name,1,2) and l.surname = r.surname and substr(l.postcode_fake,1,2) = substr(r.postcode_fake,1,2)\",\n", + " \"l.dob = r.dob and l.postcode_fake = r.postcode_fake\",\n", + " ],\n", + " recall=0.6,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:33.491551Z", + "iopub.status.busy": "2024-05-15T18:41:33.491328Z", + "iopub.status.idle": "2024-05-15T18:41:41.469753Z", + "shell.execute_reply": "2024-05-15T18:41:41.469157Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "----- Estimating u probabilities using random sampling -----\n" + ] }, { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-13T12:31:48.035745Z", - "iopub.status.busy": "2024-03-13T12:31:48.035437Z", - "iopub.status.idle": "2024-03-13T12:31:48.040474Z", - "shell.execute_reply": "2024-03-13T12:31:48.039811Z" - } - }, - "outputs": [], - "source": [ - "# Uncomment and run this cell if you're running in Google Colab.\n", - "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev\n", - "# !pip install rapidfuzz" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Estimated u probabilities using random sampling\n" + ] }, { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-13T12:31:48.044908Z", - "iopub.status.busy": "2024-03-13T12:31:48.044547Z", - "iopub.status.idle": "2024-03-13T12:31:50.250368Z", - "shell.execute_reply": "2024-03-13T12:31:50.249579Z" - } - }, - "outputs": [], - "source": [ - "from splink import splink_datasets\n", - "import pandas as pd\n", - "\n", - "pd.options.display.max_rows = 1000\n", - "# reduce size of dataset to make things run faster\n", - "df = splink_datasets.historical_50k.sample(5000)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. 
Missing estimates for:\n", + " - first_name (no m values are trained).\n", + " - surname (no m values are trained).\n", + " - dob (no m values are trained).\n", + " - postcode_fake (no m values are trained).\n", + " - birth_place (no m values are trained).\n", + " - occupation (no m values are trained).\n" + ] + } + ], + "source": [ + "linker.estimate_u_using_random_sampling(max_pairs=1e6)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:41.473301Z", + "iopub.status.busy": "2024-05-15T18:41:41.473009Z", + "iopub.status.idle": "2024-05-15T18:41:41.683463Z", + "shell.execute_reply": "2024-05-15T18:41:41.682843Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] }, { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-13T12:31:50.254706Z", - "iopub.status.busy": "2024-03-13T12:31:50.254231Z", - "iopub.status.idle": "2024-03-13T12:31:52.256000Z", - "shell.execute_reply": "2024-03-13T12:31:52.255447Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.VConcatChart(...)" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from splink import SQLiteAPI\n", - "from splink.exploratory import profile_columns\n", - "from splink.column_expression import ColumnExpression\n", - "\n", - "db_api = SQLiteAPI()\n", - "profile_columns(\n", - " df, db_api, column_expressions=[\"first_name\", \"postcode_fake\", \"substr(dob, 1,4)\"]\n", - ")" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.first_name = r.first_name and l.surname = r.surname\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - dob\n", + " - postcode_fake\n", + " - birth_place\n", + " - occupation\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - first_name\n", + " - surname\n" + ] }, { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-13T12:31:52.321773Z", - "iopub.status.busy": "2024-03-13T12:31:52.321420Z", - "iopub.status.idle": "2024-03-13T12:31:52.655731Z", - "shell.execute_reply": "2024-03-13T12:31:52.655062Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from splink import Linker, block_on\n", - "\n", - "# Simple settings dictionary will be used for exploratory analysis\n", - "settings = {\n", - " \"link_type\": \"dedupe_only\",\n", - " \"blocking_rules_to_generate_predictions\": [\n", - " block_on(\"first_name\", \"surname\"),\n", - " block_on(\"surname\", \"dob\"),\n", - " block_on(\"first_name\", \"dob\"),\n", - " block_on(\"postcode_fake\", \"first_name\"),\n", - " ],\n", - "}\n", - "\n", - "# TODO: this shouldn't need the linker\n", - "linker = Linker(df, settings, database_api=db_api)\n", - "linker.cumulative_num_comparisons_from_blocking_rules_chart()" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] }, { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-13T12:31:52.659287Z", - "iopub.status.busy": "2024-03-13T12:31:52.659005Z", - "iopub.status.idle": "2024-03-13T12:31:52.734506Z", - "shell.execute_reply": "2024-03-13T12:31:52.733725Z" - } - }, - "outputs": [], - "source": [ - "import splink.comparison_template_library as ctl\n", - "import splink.comparison_library as cl\n", - "\n", - "settings = {\n", - " \"link_type\": \"dedupe_only\",\n", - " \"blocking_rules_to_generate_predictions\": [\n", - " \"l.first_name = r.first_name and l.surname = r.surname\",\n", - " \"l.surname = r.surname and l.dob = r.dob\",\n", - " \"l.first_name = r.first_name and l.dob = r.dob\",\n", - " \"l.postcode_fake = r.postcode_fake and l.first_name = r.first_name\",\n", - " ],\n", - " \"comparisons\": [\n", - " ctl.NameComparison(\"first_name\", fuzzy_thresholds=[0.9]).configure(\n", - " term_frequency_adjustments=True\n", - " ),\n", - " ctl.NameComparison(\"surname\", fuzzy_thresholds=[0.9]).configure(\n", - " term_frequency_adjustments=True\n", - " ),\n", - " cl.DamerauLevenshteinAtThresholds(\"dob\", [1, 2]).configure(\n", - " term_frequency_adjustments=True\n", - " ),\n", - " cl.DamerauLevenshteinAtThresholds(\"postcode_fake\", [1, 2]),\n", - " cl.ExactMatch(\"birth_place\").configure(term_frequency_adjustments=True),\n", - " cl.ExactMatch(\n", - " \"occupation\",\n", - " ).configure(term_frequency_adjustments=True),\n", - " ],\n", - " \"retain_matching_columns\": True,\n", - " \"retain_intermediate_calculation_columns\": True,\n", - " \"max_iterations\": 10,\n", - " \"em_convergence\": 0.01,\n", - "}\n", - "\n", - "linker = Linker(df, settings, database_api=db_api)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was -0.435 in probability_two_random_records_match\n" + ] }, { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-13T12:31:52.738345Z", - "iopub.status.busy": "2024-03-13T12:31:52.738054Z", - "iopub.status.idle": "2024-03-13T12:31:52.848715Z", - "shell.execute_reply": "2024-03-13T12:31:52.848067Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Probability two random records match is estimated to be 0.000152.\n", - "This means that amongst all possible pairwise record comparisons, one in 6,583.41 are expected to match. 
With 12,497,500 total possible comparisons, we expect a total of around 1,898.33 matching pairs\n" - ] - } - ], - "source": [ - "linker.estimate_probability_two_random_records_match(\n", - " [\n", - " \"l.first_name = r.first_name and l.surname = r.surname and l.dob = r.dob\",\n", - " \"substr(l.first_name,1,2) = substr(r.first_name,1,2) and l.surname = r.surname and substr(l.postcode_fake,1,2) = substr(r.postcode_fake,1,2)\",\n", - " \"l.dob = r.dob and l.postcode_fake = r.postcode_fake\",\n", - " ],\n", - " recall=0.6,\n", - ")" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was -0.0222 in probability_two_random_records_match\n" + ] }, { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-13T12:31:52.852034Z", - "iopub.status.busy": "2024-03-13T12:31:52.851791Z", - "iopub.status.idle": "2024-03-13T12:32:02.326714Z", - "shell.execute_reply": "2024-03-13T12:32:02.326100Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "----- Estimating u probabilities using random sampling -----\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Estimated u probabilities using random sampling\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Your model is not yet fully trained. Missing estimates for:\n", - " - first_name (no m values are trained).\n", - " - surname (no m values are trained).\n", - " - dob (no m values are trained).\n", - " - postcode_fake (no m values are trained).\n", - " - birth_place (no m values are trained).\n", - " - occupation (no m values are trained).\n" - ] - } - ], - "source": [ - "linker.estimate_u_using_random_sampling(max_pairs=1e6)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was -0.00727 in the m_probability of birth_place, level `All other comparisons`\n" + ] }, { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-13T12:32:02.329724Z", - "iopub.status.busy": "2024-03-13T12:32:02.329494Z", - "iopub.status.idle": "2024-03-13T12:32:02.513190Z", - "shell.execute_reply": "2024-03-13T12:32:02.512437Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "----- Starting EM training session -----\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Estimating the m probabilities of the model by blocking on:\n", - "l.first_name = r.first_name and l.surname = r.surname\n", - "\n", - "Parameter estimates will be made for the following comparison(s):\n", - " - dob\n", - " - postcode_fake\n", - " - birth_place\n", - " - occupation\n", - "\n", - "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", - " - first_name\n", - " - surname\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 1: Largest change in params was -0.619 in probability_two_random_records_match\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 2: Largest change in params was 0.0367 in the m_probability of postcode_fake, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 3: Largest change in params was 0.0135 in the 
m_probability of birth_place, level `Exact match on birth_place`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 4: Largest change in params was -0.0102 in the m_probability of dob, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 5: Largest change in params was -0.00873 in the m_probability of dob, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "EM converged after 5 iterations\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Your model is not yet fully trained. Missing estimates for:\n", - " - first_name (no m values are trained).\n", - " - surname (no m values are trained).\n" - ] - } - ], - "source": [ - "training_blocking_rule = \"l.first_name = r.first_name and l.surname = r.surname\"\n", - "training_session_names = linker.estimate_parameters_using_expectation_maximisation(\n", - " training_blocking_rule, estimate_without_term_frequencies=True\n", - ")" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 3 iterations\n" + ] }, { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-13T12:32:02.516699Z", - "iopub.status.busy": "2024-03-13T12:32:02.516402Z", - "iopub.status.idle": "2024-03-13T12:32:02.769110Z", - "shell.execute_reply": "2024-03-13T12:32:02.768582Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "----- Starting EM training session -----\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Estimating the m probabilities of the model by blocking on:\n", - "l.dob = r.dob\n", - "\n", - "Parameter estimates will be made for the following comparison(s):\n", - " - first_name\n", - " - surname\n", - " - postcode_fake\n", - " - birth_place\n", - " - occupation\n", - "\n", - "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", - " - dob\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 1: Largest change in params was -0.322 in the m_probability of first_name, level `Exact match on first_name`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 2: Largest change in params was -0.05 in the m_probability of first_name, level `Exact match on first_name`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 3: Largest change in params was -0.0123 in the m_probability of surname, level `Exact match on surname`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 4: Largest change in params was -0.0037 in the m_probability of surname, level `Exact match on surname`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "EM converged after 4 iterations\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Your model is fully trained. 
All comparisons have at least one estimate for their m and u values\n" - ] - } - ], - "source": [ - "training_blocking_rule = \"l.dob = r.dob\"\n", - "training_session_dob = linker.estimate_parameters_using_expectation_maximisation(\n", - " training_blocking_rule, estimate_without_term_frequencies=True\n", - ")" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. Missing estimates for:\n", + " - first_name (no m values are trained).\n", + " - surname (no m values are trained).\n" + ] + } + ], + "source": [ + "training_blocking_rule = \"l.first_name = r.first_name and l.surname = r.surname\"\n", + "training_session_names = linker.estimate_parameters_using_expectation_maximisation(\n", + " training_blocking_rule, estimate_without_term_frequencies=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:41.686951Z", + "iopub.status.busy": "2024-05-15T18:41:41.686683Z", + "iopub.status.idle": "2024-05-15T18:41:41.926273Z", + "shell.execute_reply": "2024-05-15T18:41:41.925689Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] }, { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The final match weights can be viewed in the match weights chart:\n" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.dob = r.dob\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - first_name\n", + " - surname\n", + " - postcode_fake\n", + " - birth_place\n", + " - occupation\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - dob\n" + ] }, { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-13T12:32:02.772061Z", - "iopub.status.busy": "2024-03-13T12:32:02.771838Z", - "iopub.status.idle": "2024-03-13T12:32:04.081397Z", - "shell.execute_reply": "2024-03-13T12:32:04.080657Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.VConcatChart(...)" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "linker.match_weights_chart()" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] }, { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-13T12:32:04.085290Z", - "iopub.status.busy": "2024-03-13T12:32:04.084995Z", - "iopub.status.idle": "2024-03-13T12:32:05.265484Z", - "shell.execute_reply": "2024-03-13T12:32:05.264842Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.LayerChart(...)" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "linker.unlinkables_chart()" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was -0.315 in the m_probability of first_name, level `Exact match on first_name`\n" + ] }, { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-13T12:32:05.269049Z", - "iopub.status.busy": "2024-03-13T12:32:05.268753Z", - "iopub.status.idle": "2024-03-13T12:32:06.298192Z", - "shell.execute_reply": "2024-03-13T12:32:06.297429Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
match_weightmatch_probabilityunique_id_lunique_id_rfirst_name_lfirst_name_rgamma_first_nametf_first_name_ltf_first_name_rbf_first_name...bf_birth_placebf_tf_adj_birth_placeoccupation_loccupation_rgamma_occupationtf_occupation_ltf_occupation_rbf_occupationbf_tf_adj_occupationmatch_key
028.6430561.000000Q7793150-1Q7793150-2thomasthomas20.0260310.02603143.024717...146.6415182.377262NoneNone-1NaNNaN1.000001.0000000
115.3542680.999976Q7793150-1Q7793150-6thomasthomas20.0260310.02603143.024717...146.6415182.377262NoneNone-1NaNNaN1.000001.0000000
228.3724121.000000Q8012283-4Q8012283-9williamwilliam20.0574690.05746943.024717...146.6415182.641403Nonesongwriter-1NaN0.0035831.000001.0000000
339.0992841.000000Q1349901-1Q1349901-3henryhenry20.0274330.02743343.024717...146.6415185.943156ornithologistornithologist10.0047770.00477722.650138.4040610
437.1635851.000000Q1349901-1Q1349901-7henryhenry20.0274330.02743343.024717...146.6415185.943156ornithologistornithologist10.0047770.00477722.650138.4040610
\n", - "

5 rows × 44 columns

\n", - "
" - ], - "text/plain": [ - " match_weight match_probability unique_id_l unique_id_r first_name_l \\\n", - "0 28.643056 1.000000 Q7793150-1 Q7793150-2 thomas \n", - "1 15.354268 0.999976 Q7793150-1 Q7793150-6 thomas \n", - "2 28.372412 1.000000 Q8012283-4 Q8012283-9 william \n", - "3 39.099284 1.000000 Q1349901-1 Q1349901-3 henry \n", - "4 37.163585 1.000000 Q1349901-1 Q1349901-7 henry \n", - "\n", - " first_name_r gamma_first_name tf_first_name_l tf_first_name_r \\\n", - "0 thomas 2 0.026031 0.026031 \n", - "1 thomas 2 0.026031 0.026031 \n", - "2 william 2 0.057469 0.057469 \n", - "3 henry 2 0.027433 0.027433 \n", - "4 henry 2 0.027433 0.027433 \n", - "\n", - " bf_first_name ... bf_birth_place bf_tf_adj_birth_place occupation_l \\\n", - "0 43.024717 ... 146.641518 2.377262 None \n", - "1 43.024717 ... 146.641518 2.377262 None \n", - "2 43.024717 ... 146.641518 2.641403 None \n", - "3 43.024717 ... 146.641518 5.943156 ornithologist \n", - "4 43.024717 ... 146.641518 5.943156 ornithologist \n", - "\n", - " occupation_r gamma_occupation tf_occupation_l tf_occupation_r \\\n", - "0 None -1 NaN NaN \n", - "1 None -1 NaN NaN \n", - "2 songwriter -1 NaN 0.003583 \n", - "3 ornithologist 1 0.004777 0.004777 \n", - "4 ornithologist 1 0.004777 0.004777 \n", - "\n", - " bf_occupation bf_tf_adj_occupation match_key \n", - "0 1.00000 1.000000 0 \n", - "1 1.00000 1.000000 0 \n", - "2 1.00000 1.000000 0 \n", - "3 22.65013 8.404061 0 \n", - "4 22.65013 8.404061 0 \n", - "\n", - "[5 rows x 44 columns]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_predict = linker.predict()\n", - "df_e = df_predict.as_pandas_dataframe(limit=5)\n", - "df_e" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was -0.0462 in the m_probability of first_name, level `Exact match on first_name`\n" + ] }, { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also view rows in this dataset as a waterfall chart as follows:\n" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was -0.0104 in the m_probability of surname, level `Exact match on surname`\n" + ] }, { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-13T12:32:06.301986Z", - "iopub.status.busy": "2024-03-13T12:32:06.301703Z", - "iopub.status.idle": "2024-03-13T12:32:08.131340Z", - "shell.execute_reply": "2024-03-13T12:32:08.130808Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.LayerChart(...)" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 4: Largest change in params was -0.00254 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 4 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n" + ] + } + ], + "source": [ + "training_blocking_rule = \"l.dob = r.dob\"\n", + "training_session_dob = linker.estimate_parameters_using_expectation_maximisation(\n", + " training_blocking_rule, estimate_without_term_frequencies=True\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The final match weights can be viewed in the match weights chart:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:41.929306Z", + "iopub.status.busy": "2024-05-15T18:41:41.929078Z", + "iopub.status.idle": "2024-05-15T18:41:42.230106Z", + "shell.execute_reply": "2024-05-15T18:41:42.229484Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" ], - "source": [ - "from splink.charts import waterfall_chart\n", - "\n", - "records_to_plot = df_e.to_dict(orient=\"records\")\n", - "linker.waterfall_chart(records_to_plot, filter_nulls=False)" + "text/plain": [ + "alt.VConcatChart(...)" ] - }, + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.match_weights_chart()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:42.233172Z", + "iopub.status.busy": "2024-05-15T18:41:42.232933Z", + "iopub.status.idle": "2024-05-15T18:41:42.813828Z", + "shell.execute_reply": "2024-05-15T18:41:42.813043Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-13T12:32:08.134481Z", - "iopub.status.busy": "2024-03-13T12:32:08.134216Z", - "iopub.status.idle": "2024-03-13T12:32:08.396765Z", - "shell.execute_reply": "2024-03-13T12:32:08.396033Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Completed iteration 1, root rows count 4\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Completed iteration 2, root rows count 1\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Completed iteration 3, root rows count 1\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Completed iteration 4, root rows count 0\n" - ] - } + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" ], - "source": [ - "clusters = linker.cluster_pairwise_predictions_at_threshold(\n", - " df_predict, threshold_match_probability=0.95\n", - ")" + "text/plain": [ + "alt.LayerChart(...)" ] - }, + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.unlinkables_chart()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:42.817975Z", + "iopub.status.busy": "2024-05-15T18:41:42.817397Z", + "iopub.status.idle": "2024-05-15T18:41:43.292311Z", + "shell.execute_reply": "2024-05-15T18:41:43.291620Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-13T12:32:08.401232Z", - "iopub.status.busy": "2024-03-13T12:32:08.400923Z", - "iopub.status.idle": "2024-03-13T12:32:08.484416Z", - "shell.execute_reply": "2024-03-13T12:32:08.483862Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
match_weightmatch_probabilityunique_id_lunique_id_rfirst_name_lfirst_name_rgamma_first_nametf_first_name_ltf_first_name_rbf_first_name...bf_birth_placebf_tf_adj_birth_placeoccupation_loccupation_rgamma_occupationtf_occupation_ltf_occupation_rbf_occupationbf_tf_adj_occupationmatch_key
030.7379211.000000Q6139106-4Q6139106-5jamesjames20.0262260.02622643.192839...125.2665812.594855NoneNone-1NaNNaN1.0000001.0000000
1-10.7284430.000589Q5545144-4Q608545-9georgegeorge20.0320320.03203243.192839...0.1732831.000000rugby union playerentomologist00.0134560.0019220.0844151.0000000
235.6066361.000000Q4888351-1Q4888351-2benjaminbenjamin20.0032030.00320343.192839...125.2665814.757234rugby union playerrugby union player10.0134560.01345624.4444132.7931460
335.6066361.000000Q4888351-1Q4888351-4benjaminbenjamin20.0032030.00320343.192839...125.2665814.757234rugby union playerrugby union player10.0134560.01345624.4444132.7931460
436.8668101.000000Q1293322-1Q1293322-4edwardedward20.0184180.01841843.192839...125.2665812.195647priestpriest10.0246060.02460624.4444131.5275020
\n", + "

5 rows × 44 columns

\n", + "
" ], - "source": [ - "linker.cluster_studio_dashboard(\n", - " df_predict,\n", - " clusters,\n", - " \"dashboards/50k_cluster.html\",\n", - " sampling_method=\"by_cluster_size\",\n", - " overwrite=True,\n", - ")\n", - "\n", - "from IPython.display import IFrame\n", - "\n", - "IFrame(src=\"./dashboards/50k_cluster.html\", width=\"100%\", height=1200)" + "text/plain": [ + " match_weight match_probability unique_id_l unique_id_r first_name_l \\\n", + "0 30.737921 1.000000 Q6139106-4 Q6139106-5 james \n", + "1 -10.728443 0.000589 Q5545144-4 Q608545-9 george \n", + "2 35.606636 1.000000 Q4888351-1 Q4888351-2 benjamin \n", + "3 35.606636 1.000000 Q4888351-1 Q4888351-4 benjamin \n", + "4 36.866810 1.000000 Q1293322-1 Q1293322-4 edward \n", + "\n", + " first_name_r gamma_first_name tf_first_name_l tf_first_name_r \\\n", + "0 james 2 0.026226 0.026226 \n", + "1 george 2 0.032032 0.032032 \n", + "2 benjamin 2 0.003203 0.003203 \n", + "3 benjamin 2 0.003203 0.003203 \n", + "4 edward 2 0.018418 0.018418 \n", + "\n", + " bf_first_name ... bf_birth_place bf_tf_adj_birth_place \\\n", + "0 43.192839 ... 125.266581 2.594855 \n", + "1 43.192839 ... 0.173283 1.000000 \n", + "2 43.192839 ... 125.266581 4.757234 \n", + "3 43.192839 ... 125.266581 4.757234 \n", + "4 43.192839 ... 125.266581 2.195647 \n", + "\n", + " occupation_l occupation_r gamma_occupation tf_occupation_l \\\n", + "0 None None -1 NaN \n", + "1 rugby union player entomologist 0 0.013456 \n", + "2 rugby union player rugby union player 1 0.013456 \n", + "3 rugby union player rugby union player 1 0.013456 \n", + "4 priest priest 1 0.024606 \n", + "\n", + " tf_occupation_r bf_occupation bf_tf_adj_occupation match_key \n", + "0 NaN 1.000000 1.000000 0 \n", + "1 0.001922 0.084415 1.000000 0 \n", + "2 0.013456 24.444413 2.793146 0 \n", + "3 0.013456 24.444413 2.793146 0 \n", + "4 0.024606 24.444413 1.527502 0 \n", + "\n", + "[5 rows x 44 columns]" ] - }, + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_predict = linker.predict()\n", + "df_e = df_predict.as_pandas_dataframe(limit=5)\n", + "df_e" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also view rows in this dataset as a waterfall chart as follows:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:43.296030Z", + "iopub.status.busy": "2024-05-15T18:41:43.295753Z", + "iopub.status.idle": "2024-05-15T18:41:43.969119Z", + "shell.execute_reply": "2024-05-15T18:41:43.968521Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-13T12:32:08.487725Z", - "iopub.status.busy": "2024-03-13T12:32:08.487457Z", - "iopub.status.idle": "2024-03-13T12:32:15.290258Z", - "shell.execute_reply": "2024-03-13T12:32:15.289427Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" ], - "source": [ - "linker.roc_chart_from_labels_column(\"cluster\", match_weight_round_to_nearest=0.02)" + "text/plain": [ + "alt.LayerChart(...)" ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "records_to_plot = df_e.to_dict(orient=\"records\")\n", + "linker.waterfall_chart(records_to_plot, filter_nulls=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:43.972219Z", + "iopub.status.busy": "2024-05-15T18:41:43.971787Z", + "iopub.status.idle": "2024-05-15T18:41:44.116709Z", + "shell.execute_reply": "2024-05-15T18:41:44.115993Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Completed iteration 1, root rows count 4\n" + ] }, { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-13T12:32:15.311973Z", - "iopub.status.busy": "2024-03-13T12:32:15.311623Z", - "iopub.status.idle": "2024-03-13T12:32:17.921687Z", - "shell.execute_reply": "2024-03-13T12:32:17.920816Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.LayerChart(...)" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } + "name": "stderr", + "output_type": "stream", + "text": [ + "Completed iteration 2, root rows count 0\n" + ] + } + ], + "source": [ + "clusters = linker.cluster_pairwise_predictions_at_threshold(\n", + " df_predict, threshold_match_probability=0.95\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:44.120162Z", + "iopub.status.busy": "2024-05-15T18:41:44.119922Z", + "iopub.status.idle": "2024-05-15T18:41:44.180152Z", + "shell.execute_reply": "2024-05-15T18:41:44.179445Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " ], - "source": [ - "records = linker.prediction_errors_from_labels_column(\n", - " \"cluster\",\n", - " threshold=0.999,\n", - " include_false_negatives=False,\n", - " include_false_positives=True,\n", - ").as_record_dict()\n", - "linker.waterfall_chart(records)" + "text/plain": [ + "" ] - }, + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.cluster_studio_dashboard(\n", + " df_predict,\n", + " clusters,\n", + " \"dashboards/50k_cluster.html\",\n", + " sampling_method=\"by_cluster_size\",\n", + " overwrite=True,\n", + ")\n", + "\n", + "from IPython.display import IFrame\n", + "\n", + "IFrame(src=\"./dashboards/50k_cluster.html\", width=\"100%\", height=1200)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:44.184020Z", + "iopub.status.busy": "2024-05-15T18:41:44.183710Z", + "iopub.status.idle": "2024-05-15T18:41:46.543532Z", + "shell.execute_reply": "2024-05-15T18:41:46.542614Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-13T12:32:17.925704Z", - "iopub.status.busy": "2024-03-13T12:32:17.925400Z", - "iopub.status.idle": "2024-03-13T12:32:22.800322Z", - "shell.execute_reply": "2024-03-13T12:32:22.799605Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.LayerChart(...)" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" ], - "source": [ - "# Some of the false negatives will be because they weren't detected by the blocking rules\n", - "records = linker.prediction_errors_from_labels_column(\n", - " \"cluster\",\n", - " threshold=0.5,\n", - " include_false_negatives=True,\n", - " include_false_positives=False,\n", - ").as_record_dict(limit=50)\n", - "\n", - "linker.waterfall_chart(records)" + "text/plain": [ + "alt.Chart(...)" ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" + ], + "source": [ + "linker.roc_chart_from_labels_column(\"cluster\", match_weight_round_to_nearest=0.02)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:46.557696Z", + "iopub.status.busy": "2024-05-15T18:41:46.557395Z", + "iopub.status.idle": "2024-05-15T18:41:47.295019Z", + "shell.execute_reply": "2024-05-15T18:41:47.294474Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "records = linker.prediction_errors_from_labels_column(\n", + " \"cluster\",\n", + " threshold=0.999,\n", + " include_false_negatives=False,\n", + " include_false_positives=True,\n", + ").as_record_dict()\n", + "linker.waterfall_chart(records)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:47.298555Z", + "iopub.status.busy": "2024-05-15T18:41:47.298310Z", + "iopub.status.idle": "2024-05-15T18:41:50.039196Z", + "shell.execute_reply": "2024-05-15T18:41:50.038400Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" } + ], + "source": [ + "# Some of the false negatives will be because they weren't detected by the blocking rules\n", + "records = linker.prediction_errors_from_labels_column(\n", + " \"cluster\",\n", + " threshold=0.5,\n", + " include_false_negatives=True,\n", + " include_false_positives=False,\n", + ").as_record_dict(limit=50)\n", + "\n", + "linker.waterfall_chart(records)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 4 + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } From 4b1d0090b8edca40ef6c77fc2a48ea3a39a64711 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 15 May 2024 20:19:51 +0100 Subject: [PATCH 47/59] check source dataset works as intended --- tests/test_analyse_blocking.py | 47 ++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/tests/test_analyse_blocking.py b/tests/test_analyse_blocking.py index b6b760063c..1c9c6c4d33 100644 --- a/tests/test_analyse_blocking.py +++ b/tests/test_analyse_blocking.py @@ -126,6 +126,53 @@ def validate_blocking_output(comparison_count_args, expected_out): assert expected_out["cartesian"] == records[0]["cartesian"] +@mark_with_dialects_excluding() +def test_source_dataset_works_as_expected(test_helpers, dialect): + helper = test_helpers[dialect] + df_1 = pd.DataFrame( + [ + {"unique_id": 1, "first_name": "John", "surname": "Smith"}, + {"unique_id": 2, "first_name": "Mary", "surname": "Jones"}, + {"unique_id": 3, "first_name": "Jane", "surname": "Taylor"}, + {"unique_id": 4, "first_name": "John", "surname": "Brown"}, + ] + ) + + df_2 = pd.DataFrame( + [ + {"unique_id": 1, "first_name": "John", "surname": "Smyth"}, + {"unique_id": 2, "first_name": "Mary", "surname": "Jones"}, + {"unique_id": 3, "first_name": "Jayne", "surname": "Tailor"}, + ] + ) + df_1["src_dataset"] = "df_1" + df_2["src_dataset"] = "df_2" + df_concat = pd.concat([df_1.copy(), df_2.copy()]) + df_1.drop(columns=["src_dataset"], inplace=True) + df_2.drop(columns=["src_dataset"], inplace=True) + + db_api = helper.DatabaseAPI(**helper.db_api_args()) + + r1 = cumulative_comparisons_to_be_scored_from_blocking_rules_data( + table_or_tables=df_concat, + blocking_rule_creators=[block_on("first_name")], + db_api=db_api, + unique_id_column_name="unique_id", + source_dataset_column_name="src_dataset", + link_type="link_only", + ) + + r2 = cumulative_comparisons_to_be_scored_from_blocking_rules_data( + table_or_tables=[df_1, df_2], + blocking_rule_creators=[block_on("first_name")], + db_api=db_api, + unique_id_column_name="unique_id", + link_type="link_only", + source_dataset_column_name="source_dataset", + ) + assert r1.to_dict(orient="records") == r2.to_dict(orient="records") + + @mark_with_dialects_excluding() def test_blocking_records_accuracy(test_helpers, dialect): from numpy import nan From 1899563c987c1e885aa6ed381747cc66cf362009 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Thu, 16 May 2024 07:49:39 +0100 Subject: [PATCH 48/59] deal with different intput types, including single table with source dataset column name --- 
splink/internals/blocking_analysis.py | 125 ++++++++++++++++++++------ 1 file changed, 96 insertions(+), 29 deletions(-) diff --git a/splink/internals/blocking_analysis.py b/splink/internals/blocking_analysis.py index 025ccafe6c..4ba02a1165 100644 --- a/splink/internals/blocking_analysis.py +++ b/splink/internals/blocking_analysis.py @@ -1,7 +1,7 @@ from __future__ import annotations import logging -from typing import Any, Dict, Iterable, List, Optional, Sequence, Union +from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union import pandas as pd import sqlglot @@ -36,6 +36,7 @@ def _number_of_comparisons_generated_by_blocking_rule_post_filters_sqls( link_type: str, db_api: DatabaseAPISubClass, unique_id_column_name: str, + source_dataset_column_name: Optional[str], ) -> list[dict[str, str]]: input_dataframes = list(input_data_dict.values()) @@ -43,18 +44,27 @@ def _number_of_comparisons_generated_by_blocking_rule_post_filters_sqls( if two_dataset_link_only: link_type = "two_dataset_link_only" - if len(input_dataframes) > 1 and not two_dataset_link_only: - unique_id_cols = [ - InputColumn("source_dataset", sql_dialect=db_api.sql_dialect.name), - InputColumn(unique_id_column_name, sql_dialect=db_api.sql_dialect.name), - ] + source_dataset_input_column, unique_id_input_column = _process_unique_id_columns( + unique_id_column_name, + source_dataset_column_name, + input_data_dict, + link_type, + db_api.sql_dialect.name, + ) + if source_dataset_input_column: + unique_id_cols = [source_dataset_input_column, unique_id_input_column] else: - unique_id_cols = [ - InputColumn(unique_id_column_name, sql_dialect=db_api.sql_dialect.name), - ] + unique_id_cols = [unique_id_input_column] where_condition = _sql_gen_where_condition(link_type, unique_id_cols) + # If it's a link_only or link_and_dedupe and no source_dataset_column_name is + # provided, it will have been set to a default by _process_unique_id_columns + if source_dataset_input_column is None: + source_dataset_column_name = None + else: + source_dataset_column_name = source_dataset_input_column.name + sqls = [] if two_dataset_link_only: @@ -62,7 +72,9 @@ def _number_of_comparisons_generated_by_blocking_rule_post_filters_sqls( input_tablename_r = input_dataframes[1].physical_name else: sql = vertically_concatenate_sql( - input_data_dict, salting_required=False, source_dataset_column_name=None + input_data_dict, + salting_required=False, + source_dataset_column_name=source_dataset_column_name, ) sqls.append({"sql": sql, "output_table_name": "__splink__df_concat"}) @@ -218,6 +230,61 @@ def _row_counts_per_input_table( return db_api.sql_pipeline_to_splink_dataframe(pipeline) +def _process_unique_id_columns( + unique_id_column_name: str, + source_dataset_column_name: Optional[str], + splink_df_dict: dict[str, "SplinkDataFrame"], + link_type: user_input_link_type_options, + sql_dialect_name: str, +) -> Tuple[Optional[InputColumn], InputColumn]: + # Various options: + # In the dedupe_only case we do need a source dataset column. If it is provided, + # retain it. 
(it'll probably be ignored, but does no harm) + # + # link_only, link_and_dedupe cases: The user may have provided a single input + # table, in which case their input table must contain the source dataset column + # + # If the user provides n tables, then we can create the source dataset column + # for them a default name + + if link_type == "dedupe_only": + if source_dataset_column_name is None: + return [ + None, + InputColumn(unique_id_column_name, sql_dialect=sql_dialect_name), + ] + else: + return [ + InputColumn(source_dataset_column_name, sql_dialect=sql_dialect_name), + InputColumn(unique_id_column_name, sql_dialect=sql_dialect_name), + ] + + if link_type in ("link_only", "link_and_dedupe") and len(splink_df_dict) == 1: + # get first iem in splink_df_dict + df = next(iter(splink_df_dict.values())) + cols = df.columns + if source_dataset_column_name not in [col.unquote().name for col in cols]: + raise ValueError( + "You have provided a single input table with link type 'link_only' or " + "'link_and_dedupe'. You provided a source_dataset_column_name of " + f"'{source_dataset_column_name}'.\nThis column was not found " + "in the input data, so Splink does not know how to split your input " + "data into multiple tables.\n Either provide multiple input datasets, " + "or create a source dataset column name in your input table" + ) + + if source_dataset_column_name is None: + return [ + InputColumn("source_dataset", sql_dialect=sql_dialect_name), + InputColumn(unique_id_column_name, sql_dialect=sql_dialect_name), + ] + else: + return [ + InputColumn(source_dataset_column_name, sql_dialect=sql_dialect_name), + InputColumn(unique_id_column_name, sql_dialect=sql_dialect_name), + ] + + def _cumulative_comparisons_to_be_scored_from_blocking_rules( *, splink_df_dict: dict[str, "SplinkDataFrame"], @@ -228,23 +295,13 @@ def _cumulative_comparisons_to_be_scored_from_blocking_rules( unique_id_column_name: str, source_dataset_column_name: Optional[str], ) -> pd.DataFrame: - unique_id_input_column = InputColumn( - unique_id_column_name, sql_dialect=db_api.sql_dialect.name + source_dataset_input_column, unique_id_input_column = _process_unique_id_columns( + unique_id_column_name, + source_dataset_column_name, + splink_df_dict, + link_type, + db_api.sql_dialect.name, ) - if link_type == "dedupe_only": - source_dataset_input_column = None - input_columns = [unique_id_input_column] - else: - if source_dataset_column_name is not None: - source_dataset_input_column = InputColumn( - source_dataset_column_name, sql_dialect=db_api.sql_dialect.name - ) - input_columns = [unique_id_input_column, source_dataset_input_column] - else: - raise ValueError( - "source_dataset_column_name cannot be None " - "for link_type other than 'dedupe_only'" - ) # Check none of the blocking rules will create a vast/computationally # intractable number of comparisons @@ -258,6 +315,7 @@ def _cumulative_comparisons_to_be_scored_from_blocking_rules( max_rows_limit=max_rows_limit, compute_post_filter_count=False, unique_id_column_name=unique_id_column_name, + source_dataset_column_name=source_dataset_column_name, ) count_pre_filter = count[ "number_of_comparisons_generated_pre_filter_conditions" @@ -276,7 +334,7 @@ def _cumulative_comparisons_to_be_scored_from_blocking_rules( rc = _row_counts_per_input_table( splink_df_dict=splink_df_dict, link_type=link_type, - source_dataset_column_name=source_dataset_column_name, + source_dataset_column_name=source_dataset_input_column.name, db_api=db_api, ).as_record_dict() @@ -304,6 +362,7 @@ 
def _cumulative_comparisons_to_be_scored_from_blocking_rules( pipeline.enqueue_sql(sql, "__splink__df_concat") + input_columns = [source_dataset_input_column, unique_id_input_column] sql_select_expr = ",".join( [item for c in input_columns for item in c.l_r_names_as_l_r] ) @@ -397,7 +456,8 @@ def _count_comparisons_generated_from_blocking_rule( db_api: DatabaseAPISubClass, compute_post_filter_count: bool, max_rows_limit: int = int(1e9), - unique_id_column_name: str = "unique_id", + unique_id_column_name: str, + source_dataset_column_name: Optional[str], ) -> dict[str, Union[int, str]]: # TODO: if it's an exploding blocking rule, make sure we error out pipeline = CTEPipeline() @@ -440,7 +500,12 @@ def add_l_r(sql, table_name): if pre_filter_total < max_rows_limit: pipeline = CTEPipeline() sqls = _number_of_comparisons_generated_by_blocking_rule_post_filters_sqls( - splink_df_dict, blocking_rule, link_type, db_api, unique_id_column_name + splink_df_dict, + blocking_rule, + link_type, + db_api, + unique_id_column_name, + source_dataset_column_name, ) pipeline.enqueue_list_of_sqls(sqls) post_filter_total_df = db_api.sql_pipeline_to_splink_dataframe(pipeline) @@ -476,6 +541,7 @@ def count_comparisons_from_blocking_rule( link_type: user_input_link_type_options, db_api: DatabaseAPISubClass, unique_id_column_name: str, + source_dataset_column_name: Optional[str] = None, compute_post_filter_count: bool = True, max_rows_limit: int = int(1e9), ) -> dict[str, Union[int, str]]: @@ -493,6 +559,7 @@ def count_comparisons_from_blocking_rule( compute_post_filter_count=compute_post_filter_count, max_rows_limit=max_rows_limit, unique_id_column_name=unique_id_column_name, + source_dataset_column_name=source_dataset_column_name, ) From 95a92ff6f74fda6419b5b95add7c5c103e0a1975 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Thu, 16 May 2024 12:35:13 +0100 Subject: [PATCH 49/59] fix bugs introduced by none inputcolumn --- splink/internals/blocking_analysis.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/splink/internals/blocking_analysis.py b/splink/internals/blocking_analysis.py index 4ba02a1165..9807fb4b51 100644 --- a/splink/internals/blocking_analysis.py +++ b/splink/internals/blocking_analysis.py @@ -203,11 +203,16 @@ def _row_counts_per_input_table( *, splink_df_dict: dict[str, "SplinkDataFrame"], link_type: backend_link_type_options, - source_dataset_column_name: Optional[str], + source_dataset_input_column: Optional[InputColumn], db_api: DatabaseAPISubClass, ) -> "SplinkDataFrame": pipeline = CTEPipeline() + if source_dataset_input_column: + source_dataset_column_name = source_dataset_input_column.name + else: + source_dataset_column_name = None + sql = vertically_concatenate_sql( splink_df_dict, salting_required=False, @@ -334,7 +339,7 @@ def _cumulative_comparisons_to_be_scored_from_blocking_rules( rc = _row_counts_per_input_table( splink_df_dict=splink_df_dict, link_type=link_type, - source_dataset_column_name=source_dataset_input_column.name, + source_dataset_input_column=source_dataset_input_column, db_api=db_api, ).as_record_dict() @@ -364,7 +369,7 @@ def _cumulative_comparisons_to_be_scored_from_blocking_rules( input_columns = [source_dataset_input_column, unique_id_input_column] sql_select_expr = ",".join( - [item for c in input_columns for item in c.l_r_names_as_l_r] + [item for c in input_columns if c is not None for item in c.l_r_names_as_l_r] ) blocking_input_tablename_l = "__splink__df_concat" From 01b8d6e01437ee76bb2d3cd2a56907fe769b42d1 Mon Sep 17 
00:00:00 2001 From: Robin Linacre Date: Thu, 16 May 2024 12:42:59 +0100 Subject: [PATCH 50/59] add further tests --- tests/test_analyse_blocking.py | 72 ++++++++++++++++++++++++++++++++++ tests/test_new_db_api.py | 1 - 2 files changed, 72 insertions(+), 1 deletion(-) diff --git a/tests/test_analyse_blocking.py b/tests/test_analyse_blocking.py index 1c9c6c4d33..874c90fa42 100644 --- a/tests/test_analyse_blocking.py +++ b/tests/test_analyse_blocking.py @@ -172,6 +172,78 @@ def test_source_dataset_works_as_expected(test_helpers, dialect): ) assert r1.to_dict(orient="records") == r2.to_dict(orient="records") + df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv") + df_1 = df[df["unique_id"] % 3 == 0].copy() + df_1["sds"] = "df_1_name" + df_2 = df[df["unique_id"] % 3 == 1].copy() + df_2["sds"] = "df_2_name" + df_3 = df[df["unique_id"] % 3 == 2].copy() + df_3["sds"] = "df_3_name" + + df_concat_2 = pd.concat([df_1, df_2]) + df_concat_3 = pd.concat([df_1, df_2, df_3]) + + df_1_no_sds = df[df["unique_id"] % 3 == 0].copy() + df_2_no_sds = df[df["unique_id"] % 3 == 1].copy() + df_3_no_sds = df[df["unique_id"] % 3 == 2].copy() + + count_comparisons_from_blocking_rule( + table_or_tables=df_concat_3, + blocking_rule_creator=block_on("first_name"), + link_type="dedupe_only", + unique_id_column_name="unique_id", + db_api=db_api, + ) + + r1 = count_comparisons_from_blocking_rule( + table_or_tables=df_concat_3, + blocking_rule_creator=block_on("first_name"), + link_type="link_only", + db_api=db_api, + unique_id_column_name="unique_id", + source_dataset_column_name="sds", + ) + + r2 = count_comparisons_from_blocking_rule( + table_or_tables=[df_1_no_sds, df_2_no_sds, df_3_no_sds], + blocking_rule_creator=block_on("first_name"), + link_type="link_only", + db_api=db_api, + unique_id_column_name="unique_id", + ) + # Both of the above use the vertical concat of the two datasets so should + # be equivalent + assert r1 == r2 + + r1 = count_comparisons_from_blocking_rule( + table_or_tables=df_concat_2, + blocking_rule_creator=block_on("first_name"), + link_type="link_only", + db_api=db_api, + unique_id_column_name="unique_id", + source_dataset_column_name="sds", + ) + + r2 = count_comparisons_from_blocking_rule( + table_or_tables=[df_1_no_sds, df_2_no_sds], + blocking_rule_creator=block_on("first_name"), + link_type="link_only", + db_api=db_api, + unique_id_column_name="unique_id", + ) + # There's an optimisation in the case of two input dataframes only + # so these are not the same + assert ( + r1["number_of_comparisons_generated_pre_filter_conditions"] + > r2["number_of_comparisons_generated_pre_filter_conditions"] + ) + + # But after filters, should be the same + assert ( + r1["number_of_comparisons_to_be_scored_post_filter_conditions"] + == r2["number_of_comparisons_to_be_scored_post_filter_conditions"] + ) + @mark_with_dialects_excluding() def test_blocking_records_accuracy(test_helpers, dialect): diff --git a/tests/test_new_db_api.py b/tests/test_new_db_api.py index 09e0e022b1..4cb96b7866 100644 --- a/tests/test_new_db_api.py +++ b/tests/test_new_db_api.py @@ -122,7 +122,6 @@ def test_charts(dialect, test_helpers, tmp_path): link_type="dedupe_only", db_api=db_api, unique_id_column_name="unique_id", - source_dataset_column_name="source_dataset", ) linker = Linker(df, cl_settings, db_api) From ef7fdd8a3845e127e60b774e25db0472e85c44b0 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Thu, 16 May 2024 12:50:58 +0100 Subject: [PATCH 51/59] fix tests --- scripts/reduce_notebook_runtime.py | 15 
+++++++++++++-- splink/internals/blocking_analysis.py | 4 ++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/scripts/reduce_notebook_runtime.py b/scripts/reduce_notebook_runtime.py index 7bdb4b4dc2..dff19e7c05 100644 --- a/scripts/reduce_notebook_runtime.py +++ b/scripts/reduce_notebook_runtime.py @@ -19,6 +19,13 @@ def modify_notebook(file_path): data["cells"] = data["cells"][:19] changed = True + if "sqlite" in file_path: + max_pairs = 3e5 + head_num = 800 + else: + max_pairs = 1e5 + head_num = 400 + for cell in data["cells"]: if cell["cell_type"] == "code": source = cell["source"] @@ -26,13 +33,17 @@ def modify_notebook(file_path): for line in source: if "splink_datasets" in line and "=" in line: parts = line.split("=") - parts[1] = parts[1].strip() + ".head(400)" + parts[1] = parts[1].strip() + f".head({head_num})" new_line = " = ".join(parts) + "\n" new_source.append(new_line) changed = True elif "estimate_u_using_random_sampling(" in line: new_line = ( - re.sub(r"max_pairs=\d+(\.\d+)?[eE]\d+", "max_pairs=1e5", line) + re.sub( + r"max_pairs=\d+(\.\d+)?[eE]\d+", + f"max_pairs={max_pairs}", + line, + ) + "\n" ) new_source.append(new_line) diff --git a/splink/internals/blocking_analysis.py b/splink/internals/blocking_analysis.py index 9807fb4b51..a1e47a0ae2 100644 --- a/splink/internals/blocking_analysis.py +++ b/splink/internals/blocking_analysis.py @@ -545,7 +545,7 @@ def count_comparisons_from_blocking_rule( blocking_rule_creator: Union[BlockingRuleCreator, str, Dict[str, Any]], link_type: user_input_link_type_options, db_api: DatabaseAPISubClass, - unique_id_column_name: str, + unique_id_column_name: str = "unqiue_id", source_dataset_column_name: Optional[str] = None, compute_post_filter_count: bool = True, max_rows_limit: int = int(1e9), @@ -574,7 +574,7 @@ def cumulative_comparisons_to_be_scored_from_blocking_rules_data( blocking_rule_creators: Iterable[Union[BlockingRuleCreator, str, Dict[str, Any]]], link_type: user_input_link_type_options, db_api: DatabaseAPISubClass, - unique_id_column_name: str, + unique_id_column_name: str = "unique_id", max_rows_limit: int = int(1e9), source_dataset_column_name: Optional[str] = None, ) -> pd.DataFrame: From acd5074301ee1164b2e90276d338827c793bec08 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Thu, 16 May 2024 12:59:39 +0100 Subject: [PATCH 52/59] return type is typle --- splink/internals/blocking_analysis.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/splink/internals/blocking_analysis.py b/splink/internals/blocking_analysis.py index a1e47a0ae2..e7b4e29444 100644 --- a/splink/internals/blocking_analysis.py +++ b/splink/internals/blocking_analysis.py @@ -239,7 +239,7 @@ def _process_unique_id_columns( unique_id_column_name: str, source_dataset_column_name: Optional[str], splink_df_dict: dict[str, "SplinkDataFrame"], - link_type: user_input_link_type_options, + link_type: backend_link_type_options, sql_dialect_name: str, ) -> Tuple[Optional[InputColumn], InputColumn]: # Various options: @@ -254,24 +254,24 @@ def _process_unique_id_columns( if link_type == "dedupe_only": if source_dataset_column_name is None: - return [ + return ( None, InputColumn(unique_id_column_name, sql_dialect=sql_dialect_name), - ] + ) else: - return [ + return ( InputColumn(source_dataset_column_name, sql_dialect=sql_dialect_name), InputColumn(unique_id_column_name, sql_dialect=sql_dialect_name), - ] + ) if link_type in ("link_only", "link_and_dedupe") and len(splink_df_dict) == 1: - # get first iem 
in splink_df_dict + # Get first item in splink_df_dict df = next(iter(splink_df_dict.values())) cols = df.columns if source_dataset_column_name not in [col.unquote().name for col in cols]: raise ValueError( "You have provided a single input table with link type 'link_only' or " - "'link_and_dedupe'. You provided a source_dataset_column_name of " + "'link_and_dedupe'. You provided a source_dataset_column_name of " f"'{source_dataset_column_name}'.\nThis column was not found " "in the input data, so Splink does not know how to split your input " "data into multiple tables.\n Either provide multiple input datasets, " @@ -279,15 +279,15 @@ def _process_unique_id_columns( ) if source_dataset_column_name is None: - return [ + return ( InputColumn("source_dataset", sql_dialect=sql_dialect_name), InputColumn(unique_id_column_name, sql_dialect=sql_dialect_name), - ] + ) else: - return [ + return ( InputColumn(source_dataset_column_name, sql_dialect=sql_dialect_name), InputColumn(unique_id_column_name, sql_dialect=sql_dialect_name), - ] + ) def _cumulative_comparisons_to_be_scored_from_blocking_rules( @@ -545,7 +545,7 @@ def count_comparisons_from_blocking_rule( blocking_rule_creator: Union[BlockingRuleCreator, str, Dict[str, Any]], link_type: user_input_link_type_options, db_api: DatabaseAPISubClass, - unique_id_column_name: str = "unqiue_id", + unique_id_column_name: str = "unique_id", source_dataset_column_name: Optional[str] = None, compute_post_filter_count: bool = True, max_rows_limit: int = int(1e9), From 062b0f0922b7d0b47852021c6bc629c351f38435 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Thu, 16 May 2024 13:04:02 +0100 Subject: [PATCH 53/59] mypy passes --- splink/find_brs_with_comparison_counts_below_threshold.py | 2 ++ splink/internals/blocking_analysis.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/splink/find_brs_with_comparison_counts_below_threshold.py b/splink/find_brs_with_comparison_counts_below_threshold.py index f1ad8c1751..b5a4711dee 100644 --- a/splink/find_brs_with_comparison_counts_below_threshold.py +++ b/splink/find_brs_with_comparison_counts_below_threshold.py @@ -165,6 +165,8 @@ def _search_tree_for_blocking_rules_below_threshold_count( link_type=linker._settings_obj._link_type, db_api=linker.db_api, compute_post_filter_count=False, + source_dataset_column_name=linker._settings_obj.column_info_settings.source_dataset_column_name, + unique_id_column_name=linker._settings_obj.column_info_settings.unique_id_column_name, )["number_of_comparisons_generated_pre_filter_conditions"] already_visited.add(frozenset(current_combination)) diff --git a/splink/internals/blocking_analysis.py b/splink/internals/blocking_analysis.py index e7b4e29444..f2a9dcd26f 100644 --- a/splink/internals/blocking_analysis.py +++ b/splink/internals/blocking_analysis.py @@ -33,7 +33,7 @@ def _number_of_comparisons_generated_by_blocking_rule_post_filters_sqls( input_data_dict: dict[str, "SplinkDataFrame"], blocking_rule: "BlockingRule", - link_type: str, + link_type: backend_link_type_options, db_api: DatabaseAPISubClass, unique_id_column_name: str, source_dataset_column_name: Optional[str], From 4861c7e2602c4d952d24e53d6c78dcdaeff8376e Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Thu, 16 May 2024 13:14:35 +0100 Subject: [PATCH 54/59] update notebooks --- .../duckdb/deduplicate_50k_synthetic.ipynb | 1 - .../duckdb/deterministic_dedupe.ipynb | 1 - docs/demos/examples/duckdb/transactions.ipynb | 301 ++++++++---------- .../sqlite/deduplicate_50k_synthetic.ipynb | 3 +- 
docs/demos/tutorials/03_Blocking.ipynb | 76 ++--- splink/internals/blocking_analysis.py | 2 +- 6 files changed, 174 insertions(+), 210 deletions(-) diff --git a/docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb b/docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb index 2196162f3a..0e800cf16c 100644 --- a/docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb +++ b/docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb @@ -400,7 +400,6 @@ " blocking_rule_creators=blocking_rules,\n", " db_api=db_api,\n", " link_type=\"dedupe_only\",\n", - " unique_id_column_name=\"unique_id\",\n", ")" ] }, diff --git a/docs/demos/examples/duckdb/deterministic_dedupe.ipynb b/docs/demos/examples/duckdb/deterministic_dedupe.ipynb index e47bbfec51..8442c1a7ed 100644 --- a/docs/demos/examples/duckdb/deterministic_dedupe.ipynb +++ b/docs/demos/examples/duckdb/deterministic_dedupe.ipynb @@ -324,7 +324,6 @@ " ],\n", " db_api=db_api,\n", " link_type=\"dedupe_only\",\n", - " unique_id_column_name=\"unique_id\",\n", ")" ] }, diff --git a/docs/demos/examples/duckdb/transactions.ipynb b/docs/demos/examples/duckdb/transactions.ipynb index 6207cc33ca..118aa373e3 100644 --- a/docs/demos/examples/duckdb/transactions.ipynb +++ b/docs/demos/examples/duckdb/transactions.ipynb @@ -32,10 +32,10 @@ "execution_count": 1, "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T18:47:03.769001Z", - "iopub.status.busy": "2024-05-15T18:47:03.768667Z", - "iopub.status.idle": "2024-05-15T18:47:03.790143Z", - "shell.execute_reply": "2024-05-15T18:47:03.789060Z" + "iopub.execute_input": "2024-05-16T12:13:14.252200Z", + "iopub.status.busy": "2024-05-16T12:13:14.251497Z", + "iopub.status.idle": "2024-05-16T12:13:14.257616Z", + "shell.execute_reply": "2024-05-16T12:13:14.256908Z" } }, "outputs": [], @@ -49,10 +49,10 @@ "execution_count": 2, "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T18:47:03.794648Z", - "iopub.status.busy": "2024-05-15T18:47:03.794219Z", - "iopub.status.idle": "2024-05-15T18:47:05.667707Z", - "shell.execute_reply": "2024-05-15T18:47:05.666879Z" + "iopub.execute_input": "2024-05-16T12:13:14.261383Z", + "iopub.status.busy": "2024-05-16T12:13:14.261079Z", + "iopub.status.idle": "2024-05-16T12:13:16.084252Z", + "shell.execute_reply": "2024-05-16T12:13:16.083429Z" } }, "outputs": [ @@ -200,10 +200,10 @@ "execution_count": 3, "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T18:47:05.710715Z", - "iopub.status.busy": "2024-05-15T18:47:05.710346Z", - "iopub.status.idle": "2024-05-15T18:47:06.445989Z", - "shell.execute_reply": "2024-05-15T18:47:06.445394Z" + "iopub.execute_input": "2024-05-16T12:13:16.143823Z", + "iopub.status.busy": "2024-05-16T12:13:16.143431Z", + "iopub.status.idle": "2024-05-16T12:13:16.849535Z", + "shell.execute_reply": "2024-05-16T12:13:16.848871Z" } }, "outputs": [ @@ -212,23 +212,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -306,10 +306,10 @@ "execution_count": 4, "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T18:47:06.449369Z", - "iopub.status.busy": "2024-05-15T18:47:06.449107Z", - "iopub.status.idle": "2024-05-15T18:47:07.230866Z", - "shell.execute_reply": "2024-05-15T18:47:07.230164Z" + "iopub.execute_input": "2024-05-16T12:13:16.852855Z", + "iopub.status.busy": "2024-05-16T12:13:16.852594Z", + "iopub.status.idle": "2024-05-16T12:13:18.407824Z", + "shell.execute_reply": "2024-05-16T12:13:18.407265Z" } }, "outputs": [ @@ -318,23 +318,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -441,9 +441,7 @@ " table_or_tables=[df_origin, df_destination],\n", " blocking_rule_creators=brs,\n", " db_api=db_api,\n", - " link_type=\"link_only\",\n", - " unique_id_column_name=\"unique_id\",\n", - " source_dataset_column_name=\"source_dataset\",\n", + " link_type=\"link_only\"\n", ")" ] }, @@ -452,10 +450,10 @@ "execution_count": 5, "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T18:47:07.234575Z", - "iopub.status.busy": "2024-05-15T18:47:07.234247Z", - "iopub.status.idle": "2024-05-15T18:47:07.243390Z", - "shell.execute_reply": "2024-05-15T18:47:07.242498Z" + "iopub.execute_input": "2024-05-16T12:13:18.411066Z", + "iopub.status.busy": "2024-05-16T12:13:18.410832Z", + "iopub.status.idle": "2024-05-16T12:13:18.418094Z", + "shell.execute_reply": "2024-05-16T12:13:18.416984Z" } }, "outputs": [], @@ -533,10 +531,10 @@ "execution_count": 6, "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T18:47:07.247075Z", - "iopub.status.busy": "2024-05-15T18:47:07.246785Z", - "iopub.status.idle": "2024-05-15T18:47:07.377423Z", - "shell.execute_reply": "2024-05-15T18:47:07.376705Z" + "iopub.execute_input": "2024-05-16T12:13:18.421517Z", + "iopub.status.busy": "2024-05-16T12:13:18.421286Z", + "iopub.status.idle": "2024-05-16T12:13:18.552970Z", + "shell.execute_reply": "2024-05-16T12:13:18.552184Z" } }, "outputs": [], @@ -554,10 +552,10 @@ "execution_count": 7, "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T18:47:07.381612Z", - "iopub.status.busy": "2024-05-15T18:47:07.381368Z", - "iopub.status.idle": "2024-05-15T18:47:09.028314Z", - "shell.execute_reply": "2024-05-15T18:47:09.027785Z" + "iopub.execute_input": "2024-05-16T12:13:18.556284Z", + "iopub.status.busy": "2024-05-16T12:13:18.556053Z", + "iopub.status.idle": "2024-05-16T12:13:20.529952Z", + "shell.execute_reply": "2024-05-16T12:13:20.529065Z" } }, "outputs": [ @@ -597,10 +595,10 @@ "execution_count": 8, "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T18:47:09.031158Z", - "iopub.status.busy": "2024-05-15T18:47:09.030941Z", - "iopub.status.idle": "2024-05-15T18:47:10.248208Z", - "shell.execute_reply": "2024-05-15T18:47:10.247537Z" + "iopub.execute_input": "2024-05-16T12:13:20.532832Z", + "iopub.status.busy": "2024-05-16T12:13:20.532606Z", + "iopub.status.idle": "2024-05-16T12:13:21.867808Z", + "shell.execute_reply": "2024-05-16T12:13:21.867084Z" } }, "outputs": [ @@ -639,133 +637,119 @@ "name": "stderr", "output_type": "stream", "text": [ - "Iteration 1: Largest change in params was -0.575 in the m_probability of amount, level `Exact match on amount`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 2: Largest change in params was -0.19 in the m_probability of transaction_date, level `1 day`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 3: Largest change in params was 0.0107 in the m_probability of amount, level `Percentage difference of 'amount' within 10.00%`\n" + "Iteration 1: Largest change in params was -0.58 in the m_probability of amount, level `Exact match on amount`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 4: Largest change in params was 0.00232 in the m_probability of transaction_date, level `<=30 days`\n" + "Iteration 2: Largest change in params was -0.185 in the m_probability of transaction_date, level `1 day`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 5: Largest change in params was 0.000403 
in the m_probability of transaction_date, level `<=30 days`\n" + "Iteration 3: Largest change in params was 0.0104 in the m_probability of amount, level `Percentage difference of 'amount' within 10.00%`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 6: Largest change in params was -0.000267 in the m_probability of amount, level `All other comparisons`\n" + "Iteration 4: Largest change in params was 0.00228 in the m_probability of transaction_date, level `<=30 days`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 7: Largest change in params was -0.000242 in the m_probability of amount, level `All other comparisons`\n" + "Iteration 5: Largest change in params was 0.000398 in the m_probability of transaction_date, level `<=30 days`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 8: Largest change in params was -0.00022 in the m_probability of amount, level `All other comparisons`\n" + "Iteration 6: Largest change in params was -0.00024 in the m_probability of amount, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 9: Largest change in params was -0.000202 in the m_probability of amount, level `All other comparisons`\n" + "Iteration 7: Largest change in params was -0.000218 in the m_probability of amount, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 10: Largest change in params was -0.000185 in the m_probability of amount, level `All other comparisons`\n" + "Iteration 8: Largest change in params was -0.000199 in the m_probability of amount, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 11: Largest change in params was -0.000171 in the m_probability of amount, level `All other comparisons`\n" + "Iteration 9: Largest change in params was -0.000182 in the m_probability of amount, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 12: Largest change in params was -0.000158 in the m_probability of amount, level `All other comparisons`\n" + "Iteration 10: Largest change in params was -0.000167 in the m_probability of amount, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 13: Largest change in params was -0.000146 in the m_probability of amount, level `All other comparisons`\n" + "Iteration 11: Largest change in params was -0.000154 in the m_probability of amount, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 14: Largest change in params was -0.000136 in the m_probability of amount, level `All other comparisons`\n" + "Iteration 12: Largest change in params was -0.000142 in the m_probability of amount, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 15: Largest change in params was -0.000127 in the m_probability of amount, level `All other comparisons`\n" + "Iteration 13: Largest change in params was -0.000132 in the m_probability of amount, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 16: Largest change in params was -0.000118 in the m_probability of amount, level `All other comparisons`\n" + "Iteration 14: Largest change in params was -0.000122 in the m_probability of amount, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": 
"stream", "text": [ - "Iteration 17: Largest change in params was -0.000111 in the m_probability of amount, level `All other comparisons`\n" + "Iteration 15: Largest change in params was -0.000114 in the m_probability of amount, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 18: Largest change in params was -0.000104 in the m_probability of amount, level `All other comparisons`\n" + "Iteration 16: Largest change in params was -0.000106 in the m_probability of amount, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 19: Largest change in params was -9.76e-05 in the m_probability of amount, level `All other comparisons`\n" + "Iteration 17: Largest change in params was -9.92e-05 in the m_probability of amount, level `All other comparisons`\n" ] }, { @@ -773,7 +757,7 @@ "output_type": "stream", "text": [ "\n", - "EM converged after 19 iterations\n" + "EM converged after 17 iterations\n" ] }, { @@ -805,10 +789,10 @@ "execution_count": 9, "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T18:47:10.251174Z", - "iopub.status.busy": "2024-05-15T18:47:10.250961Z", - "iopub.status.idle": "2024-05-15T18:47:11.577879Z", - "shell.execute_reply": "2024-05-15T18:47:11.577248Z" + "iopub.execute_input": "2024-05-16T12:13:21.871283Z", + "iopub.status.busy": "2024-05-16T12:13:21.871004Z", + "iopub.status.idle": "2024-05-16T12:13:23.094606Z", + "shell.execute_reply": "2024-05-16T12:13:23.093838Z" } }, "outputs": [ @@ -847,161 +831,140 @@ "name": "stderr", "output_type": "stream", "text": [ - "Iteration 1: Largest change in params was -0.381 in the m_probability of memo, level `Exact match on memo`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 2: Largest change in params was -0.101 in the m_probability of memo, level `Exact match on memo`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 3: Largest change in params was 0.021 in the m_probability of memo, level `Levenshtein distance of memo <= 10`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 4: Largest change in params was -0.00512 in the m_probability of memo, level `Exact match on memo`\n" + "Iteration 1: Largest change in params was -0.435 in the m_probability of memo, level `Exact match on memo`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 5: Largest change in params was 0.00421 in the m_probability of memo, level `All other comparisons`\n" + "Iteration 2: Largest change in params was -0.0613 in the m_probability of memo, level `Exact match on memo`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 6: Largest change in params was 0.00444 in the m_probability of memo, level `All other comparisons`\n" + "Iteration 3: Largest change in params was 0.0114 in the m_probability of memo, level `Levenshtein distance of memo <= 10`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 7: Largest change in params was 0.00442 in the m_probability of memo, level `All other comparisons`\n" + "Iteration 4: Largest change in params was 0.00446 in the m_probability of memo, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 8: Largest change in params was 0.00417 in the m_probability of memo, level `All other comparisons`\n" + "Iteration 5: Largest change in params was 0.00436 in the 
m_probability of memo, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 9: Largest change in params was 0.00375 in the m_probability of memo, level `All other comparisons`\n" + "Iteration 6: Largest change in params was 0.00398 in the m_probability of memo, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 10: Largest change in params was 0.00323 in the m_probability of memo, level `All other comparisons`\n" + "Iteration 7: Largest change in params was 0.00345 in the m_probability of memo, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 11: Largest change in params was 0.00268 in the m_probability of memo, level `All other comparisons`\n" + "Iteration 8: Largest change in params was 0.00287 in the m_probability of memo, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 12: Largest change in params was 0.00216 in the m_probability of memo, level `All other comparisons`\n" + "Iteration 9: Largest change in params was 0.00231 in the m_probability of memo, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 13: Largest change in params was 0.0017 in the m_probability of memo, level `All other comparisons`\n" + "Iteration 10: Largest change in params was 0.00181 in the m_probability of memo, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 14: Largest change in params was 0.00131 in the m_probability of memo, level `All other comparisons`\n" + "Iteration 11: Largest change in params was 0.00139 in the m_probability of memo, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 15: Largest change in params was 0.001 in the m_probability of memo, level `All other comparisons`\n" + "Iteration 12: Largest change in params was 0.00105 in the m_probability of memo, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 16: Largest change in params was 0.000754 in the m_probability of memo, level `All other comparisons`\n" + "Iteration 13: Largest change in params was 0.000789 in the m_probability of memo, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 17: Largest change in params was 0.000564 in the m_probability of memo, level `All other comparisons`\n" + "Iteration 14: Largest change in params was 0.000586 in the m_probability of memo, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 18: Largest change in params was 0.000419 in the m_probability of memo, level `All other comparisons`\n" + "Iteration 15: Largest change in params was 0.000433 in the m_probability of memo, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 19: Largest change in params was 0.00031 in the m_probability of memo, level `All other comparisons`\n" + "Iteration 16: Largest change in params was 0.000318 in the m_probability of memo, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 20: Largest change in params was 0.000229 in the m_probability of memo, level `All other comparisons`\n" + "Iteration 17: Largest change in params was 0.000233 in the m_probability of memo, 
level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 21: Largest change in params was 0.000168 in the m_probability of memo, level `All other comparisons`\n" + "Iteration 18: Largest change in params was 0.00017 in the m_probability of memo, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 22: Largest change in params was 0.000124 in the m_probability of memo, level `All other comparisons`\n" + "Iteration 19: Largest change in params was 0.000124 in the m_probability of memo, level `All other comparisons`\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Iteration 23: Largest change in params was 9.08e-05 in the m_probability of memo, level `All other comparisons`\n" + "Iteration 20: Largest change in params was 9.05e-05 in the m_probability of memo, level `All other comparisons`\n" ] }, { @@ -1009,7 +972,7 @@ "output_type": "stream", "text": [ "\n", - "EM converged after 23 iterations\n" + "EM converged after 20 iterations\n" ] }, { @@ -1030,10 +993,10 @@ "execution_count": 10, "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T18:47:11.581103Z", - "iopub.status.busy": "2024-05-15T18:47:11.580847Z", - "iopub.status.idle": "2024-05-15T18:47:11.965681Z", - "shell.execute_reply": "2024-05-15T18:47:11.964278Z" + "iopub.execute_input": "2024-05-16T12:13:23.097922Z", + "iopub.status.busy": "2024-05-16T12:13:23.097670Z", + "iopub.status.idle": "2024-05-16T12:13:23.382589Z", + "shell.execute_reply": "2024-05-16T12:13:23.382014Z" } }, "outputs": [ @@ -1042,23 +1005,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -1125,10 +1088,10 @@ "execution_count": 11, "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T18:47:11.970722Z", - "iopub.status.busy": "2024-05-15T18:47:11.970200Z", - "iopub.status.idle": "2024-05-15T18:47:34.855445Z", - "shell.execute_reply": "2024-05-15T18:47:34.854572Z" + "iopub.execute_input": "2024-05-16T12:13:23.385651Z", + "iopub.status.busy": "2024-05-16T12:13:23.385430Z", + "iopub.status.idle": "2024-05-16T12:13:47.966948Z", + "shell.execute_reply": "2024-05-16T12:13:47.966113Z" } }, "outputs": [], @@ -1141,10 +1104,10 @@ "execution_count": 12, "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T18:47:34.859818Z", - "iopub.status.busy": "2024-05-15T18:47:34.859505Z", - "iopub.status.idle": "2024-05-15T18:47:35.227217Z", - "shell.execute_reply": "2024-05-15T18:47:35.226469Z" + "iopub.execute_input": "2024-05-16T12:13:47.970901Z", + "iopub.status.busy": "2024-05-16T12:13:47.970603Z", + "iopub.status.idle": "2024-05-16T12:13:48.365220Z", + "shell.execute_reply": "2024-05-16T12:13:48.364442Z" } }, "outputs": [ @@ -1163,7 +1126,7 @@ " " ], "text/plain": [ - "" + "" ] }, "execution_count": 12, @@ -1187,10 +1150,10 @@ "execution_count": 13, "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T18:47:35.230739Z", - "iopub.status.busy": "2024-05-15T18:47:35.230449Z", - "iopub.status.idle": "2024-05-15T18:47:41.150757Z", - "shell.execute_reply": "2024-05-15T18:47:41.149898Z" + "iopub.execute_input": "2024-05-16T12:13:48.369330Z", + "iopub.status.busy": "2024-05-16T12:13:48.369001Z", + "iopub.status.idle": "2024-05-16T12:13:54.043730Z", + "shell.execute_reply": "2024-05-16T12:13:54.043073Z" } }, "outputs": [ @@ -1199,23 +1162,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -1285,10 +1248,10 @@ "execution_count": 14, "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T18:47:41.154185Z", - "iopub.status.busy": "2024-05-15T18:47:41.153896Z", - "iopub.status.idle": "2024-05-15T18:47:42.083914Z", - "shell.execute_reply": "2024-05-15T18:47:42.083253Z" + "iopub.execute_input": "2024-05-16T12:13:54.047308Z", + "iopub.status.busy": "2024-05-16T12:13:54.047030Z", + "iopub.status.idle": "2024-05-16T12:13:54.884355Z", + "shell.execute_reply": "2024-05-16T12:13:54.883814Z" } }, "outputs": [ @@ -1297,23 +1260,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ diff --git a/docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb b/docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb index fd8e724734..898f9b4af5 100644 --- a/docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb +++ b/docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb @@ -270,8 +270,7 @@ " table_or_tables=df,\n", " blocking_rule_creators=blocking_rules,\n", " db_api=db_api,\n", - " link_type=\"dedupe_only\",\n", - " unique_id_column_name=\"unique_id\",\n", + " link_type=\"dedupe_only\"\n", ")" ] }, diff --git a/docs/demos/tutorials/03_Blocking.ipynb b/docs/demos/tutorials/03_Blocking.ipynb index 6f9fee7bb2..18b22638bd 100644 --- a/docs/demos/tutorials/03_Blocking.ipynb +++ b/docs/demos/tutorials/03_Blocking.ipynb @@ -124,10 +124,10 @@ "execution_count": 1, "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:33:06.396495Z", - "iopub.status.busy": "2024-05-15T15:33:06.396182Z", - "iopub.status.idle": "2024-05-15T15:33:06.401561Z", - "shell.execute_reply": "2024-05-15T15:33:06.400904Z" + "iopub.execute_input": "2024-05-16T12:14:10.776394Z", + "iopub.status.busy": "2024-05-16T12:14:10.776043Z", + "iopub.status.idle": "2024-05-16T12:14:10.781556Z", + "shell.execute_reply": "2024-05-16T12:14:10.780845Z" } }, "outputs": [], @@ -141,10 +141,10 @@ "execution_count": 2, "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:33:06.405358Z", - "iopub.status.busy": "2024-05-15T15:33:06.405054Z", - "iopub.status.idle": "2024-05-15T15:33:08.329315Z", - "shell.execute_reply": "2024-05-15T15:33:08.328603Z" + "iopub.execute_input": "2024-05-16T12:14:10.785735Z", + "iopub.status.busy": "2024-05-16T12:14:10.785460Z", + "iopub.status.idle": "2024-05-16T12:14:12.763325Z", + "shell.execute_reply": "2024-05-16T12:14:12.762406Z" }, "tags": [] }, @@ -172,10 +172,10 @@ "execution_count": 3, "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:33:08.333777Z", - "iopub.status.busy": "2024-05-15T15:33:08.333405Z", - "iopub.status.idle": "2024-05-15T15:33:08.644698Z", - "shell.execute_reply": "2024-05-15T15:33:08.643999Z" + "iopub.execute_input": "2024-05-16T12:14:12.767657Z", + "iopub.status.busy": "2024-05-16T12:14:12.767348Z", + "iopub.status.idle": "2024-05-16T12:14:13.144051Z", + "shell.execute_reply": "2024-05-16T12:14:13.143363Z" }, "tags": [] }, @@ -191,7 +191,13 @@ "---\n", "{'number_of_comparisons_generated_pre_filter_conditions': 2153, 'number_of_comparisons_to_be_scored_post_filter_conditions': 682, 'filter_conditions_identified': '', 'equi_join_conditions_identified': 'l.\"email\" = r.\"email\"'}\n", "---\n", - "{'number_of_comparisons_generated_pre_filter_conditions': 1304, 'number_of_comparisons_to_be_scored_post_filter_conditions': 315, 'filter_conditions_identified': '', 'equi_join_conditions_identified': 'l.\"city\" = r.\"city\" AND l.\"first_name\" = r.\"first_name\"'}\n", + "{'number_of_comparisons_generated_pre_filter_conditions': 1304, 'number_of_comparisons_to_be_scored_post_filter_conditions': 315, 'filter_conditions_identified': '', 'equi_join_conditions_identified': 'l.\"city\" = r.\"city\" AND l.\"first_name\" = r.\"first_name\"'}\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "---\n", "{'number_of_comparisons_generated_pre_filter_conditions': 4827, 'number_of_comparisons_to_be_scored_post_filter_conditions': 372, 'filter_conditions_identified': 'LEVENSHTEIN(l.surname, r.surname) < 2', 'equi_join_conditions_identified': 'l.first_name = r.first_name'}\n" ] @@ 
-216,7 +222,6 @@ " blocking_rule_creator=br,\n", " link_type=\"dedupe_only\",\n", " db_api=db_api,\n", - " unique_id_column_name=\"unique_id\",\n", " )\n", " print(\"---\")\n", " print(counts)" @@ -249,10 +254,10 @@ "execution_count": 4, "metadata": { "execution": { - "iopub.execute_input": "2024-05-15T15:33:08.687383Z", - "iopub.status.busy": "2024-05-15T15:33:08.687063Z", - "iopub.status.idle": "2024-05-15T15:33:08.916238Z", - "shell.execute_reply": "2024-05-15T15:33:08.915508Z" + "iopub.execute_input": "2024-05-16T12:14:13.150066Z", + "iopub.status.busy": "2024-05-16T12:14:13.149747Z", + "iopub.status.idle": "2024-05-16T12:14:13.396698Z", + "shell.execute_reply": "2024-05-16T12:14:13.395979Z" }, "tags": [] }, @@ -262,23 +267,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ diff --git a/splink/internals/blocking_analysis.py b/splink/internals/blocking_analysis.py index f2a9dcd26f..9fe0fb238a 100644 --- a/splink/internals/blocking_analysis.py +++ b/splink/internals/blocking_analysis.py @@ -606,7 +606,7 @@ def cumulative_comparisons_to_be_scored_from_blocking_rules_chart( blocking_rule_creators: Iterable[Union[BlockingRuleCreator, str, Dict[str, Any]]], link_type: user_input_link_type_options, db_api: DatabaseAPISubClass, - unique_id_column_name: str, + unique_id_column_name: str = "unique_id", max_rows_limit: int = int(1e9), source_dataset_column_name: Optional[str] = None, ) -> ChartReturnType: From 4910795c6020387d58fcdd84090d0134ca21d6e2 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Thu, 16 May 2024 13:20:53 +0100 Subject: [PATCH 55/59] ensure is iterable --- splink/internals/blocking_analysis.py | 6 +++++- splink/linker.py | 3 +++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/splink/internals/blocking_analysis.py b/splink/internals/blocking_analysis.py index 9fe0fb238a..e5a6cd46c0 100644 --- a/splink/internals/blocking_analysis.py +++ b/splink/internals/blocking_analysis.py @@ -19,7 +19,7 @@ from ..charts import ChartReturnType, cumulative_blocking_rule_comparisons_generated from ..database_api import AcceptableInputTableType, DatabaseAPISubClass from ..input_column import InputColumn -from ..misc import calculate_cartesian +from ..misc import calculate_cartesian, ensure_is_iterable from ..pipeline import CTEPipeline from ..splink_dataframe import SplinkDataFrame from ..vertically_concatenate import ( @@ -580,6 +580,8 @@ def cumulative_comparisons_to_be_scored_from_blocking_rules_data( ) -> pd.DataFrame: splink_df_dict = db_api.register_multiple_tables(table_or_tables) + blocking_rule_creators = ensure_is_iterable(blocking_rule_creators) + blocking_rules: List[BlockingRule] = [] for br in blocking_rule_creators: if isinstance(br, BlockingRule): @@ -612,6 +614,8 @@ def cumulative_comparisons_to_be_scored_from_blocking_rules_chart( ) -> ChartReturnType: splink_df_dict = db_api.register_multiple_tables(table_or_tables) + blocking_rule_creators = ensure_is_iterable(blocking_rule_creators) + blocking_rules: List[BlockingRule] = [] for br in blocking_rule_creators: if isinstance(br, BlockingRule): diff --git a/splink/linker.py b/splink/linker.py index 0c589cb26f..94c489e3c5 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -75,6 +75,7 @@ from .misc import ( ascii_uid, bayes_factor_to_prob, + ensure_is_iterable, ensure_is_list, prob_to_bayes_factor, ) @@ -2820,6 +2821,8 @@ def estimate_probability_two_random_records_match( f"Estimated recall must be greater than 0 " f"and no more than 1. Supplied value {recall}." 
) from None + + deterministic_matching_rules = ensure_is_iterable(deterministic_matching_rules) blocking_rules: List[BlockingRule] = [] for br in deterministic_matching_rules: blocking_rules.append( From 4afc9e2b2908de7081b92bed7eaceccd8980e1f6 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Thu, 16 May 2024 14:01:54 +0100 Subject: [PATCH 56/59] add where filter condition to output --- splink/blocking.py | 4 +++- splink/internals/blocking_analysis.py | 17 +++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/splink/blocking.py b/splink/blocking.py index 4dfbb359cf..11853fae08 100644 --- a/splink/blocking.py +++ b/splink/blocking.py @@ -535,7 +535,9 @@ def materialise_exploded_id_tables( return exploding_blocking_rules -def _sql_gen_where_condition(link_type, unique_id_cols): +def _sql_gen_where_condition( + link_type: backend_link_type_options, unique_id_cols: List[InputColumn] +) -> str: id_expr_l = _composite_unique_id_from_nodes_sql(unique_id_cols, "l") id_expr_r = _composite_unique_id_from_nodes_sql(unique_id_cols, "r") diff --git a/splink/internals/blocking_analysis.py b/splink/internals/blocking_analysis.py index e5a6cd46c0..e9312be7c0 100644 --- a/splink/internals/blocking_analysis.py +++ b/splink/internals/blocking_analysis.py @@ -494,12 +494,28 @@ def add_l_r(sql, table_name): if filter_conditions == "TRUE": filter_conditions = "" + source_dataset_input_column, unique_id_input_column = _process_unique_id_columns( + unique_id_column_name, + source_dataset_column_name, + splink_df_dict, + link_type, + db_api.sql_dialect.name, + ) + + if source_dataset_input_column: + uid_for_where = [source_dataset_input_column, unique_id_input_column] + else: + uid_for_where = [unique_id_input_column] + + join_condition_sql = _sql_gen_where_condition(link_type, uid_for_where) + if not compute_post_filter_count: return { "number_of_comparisons_generated_pre_filter_conditions": pre_filter_total, "number_of_comparisons_to_be_scored_post_filter_conditions": "not computed", "filter_conditions_identified": filter_conditions, "equi_join_conditions_identified": equi_join_conditions_joined, + "inner_join_condition_identified": join_condition_sql, } if pre_filter_total < max_rows_limit: @@ -536,6 +552,7 @@ def add_l_r(sql, table_name): "number_of_comparisons_to_be_scored_post_filter_conditions": post_filter_total, "filter_conditions_identified": filter_conditions, "equi_join_conditions_identified": equi_join_conditions_joined, + "inner_join_condition_identified": join_condition_sql, } From b362b765d0bf63ab708f3c962e920f1e7c48c299 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Thu, 16 May 2024 14:04:25 +0100 Subject: [PATCH 57/59] fix tests --- tests/test_analyse_blocking.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/test_analyse_blocking.py b/tests/test_analyse_blocking.py index 874c90fa42..54999e11a9 100644 --- a/tests/test_analyse_blocking.py +++ b/tests/test_analyse_blocking.py @@ -213,7 +213,12 @@ def test_source_dataset_works_as_expected(test_helpers, dialect): ) # Both of the above use the vertical concat of the two datasets so should # be equivalent - assert r1 == r2 + keys_to_check = [ + "number_of_comparisons_generated_pre_filter_conditions", + "number_of_comparisons_to_be_scored_post_filter_conditions", + ] + for k in keys_to_check: + assert r1[k] == r2[k] r1 = count_comparisons_from_blocking_rule( table_or_tables=df_concat_2, From 8290dfbbea9ec7c3d8b3bd43c8f54d2a034b9d66 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Thu, 16 
May 2024 14:20:57 +0100 Subject: [PATCH 58/59] rename api --- .../duckdb/deterministic_dedupe.ipynb | 2 +- docs/demos/examples/duckdb/febrl3.ipynb | 2 +- docs/demos/examples/duckdb/febrl4.ipynb | 8018 ++++++++--------- docs/demos/examples/duckdb/transactions.ipynb | 3228 +++---- .../sqlite/deduplicate_50k_synthetic.ipynb | 2958 +++--- docs/demos/tutorials/03_Blocking.ipynb | 1000 +- splink/internals/blocking_analysis.py | 53 +- tests/test_analyse_blocking.py | 64 +- tests/test_full_example_deterministic_link.py | 2 +- tests/test_full_example_duckdb.py | 2 +- tests/test_full_example_postgres.py | 4 +- tests/test_new_db_api.py | 2 +- tests/test_total_comparison_count.py | 2 +- 13 files changed, 7667 insertions(+), 7670 deletions(-) diff --git a/docs/demos/examples/duckdb/deterministic_dedupe.ipynb b/docs/demos/examples/duckdb/deterministic_dedupe.ipynb index 8442c1a7ed..494330d616 100644 --- a/docs/demos/examples/duckdb/deterministic_dedupe.ipynb +++ b/docs/demos/examples/duckdb/deterministic_dedupe.ipynb @@ -317,7 +317,7 @@ "db_api = DuckDBAPI()\n", "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", " table_or_tables=df,\n", - " blocking_rule_creators=[\n", + " blocking_rules=[\n", " block_on(\"first_name\", \"surname\", \"dob\"),\n", " block_on(\"surname\", \"dob\", \"postcode_fake\"),\n", " block_on(\"first_name\", \"dob\", \"occupation\"),\n", diff --git a/docs/demos/examples/duckdb/febrl3.ipynb b/docs/demos/examples/duckdb/febrl3.ipynb index ae12e01cbd..a9b2f6a4ac 100644 --- a/docs/demos/examples/duckdb/febrl3.ipynb +++ b/docs/demos/examples/duckdb/febrl3.ipynb @@ -513,7 +513,7 @@ "db_api = DuckDBAPI()\n", "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", " table_or_tables=df,\n", - " blocking_rule_creators=blocking_rules,\n", + " blocking_rules=blocking_rules,\n", " db_api=db_api,\n", " link_type=\"dedupe_only\",\n", " unique_id_column_name=\"rec_id\",\n", diff --git a/docs/demos/examples/duckdb/febrl4.ipynb b/docs/demos/examples/duckdb/febrl4.ipynb index 7761fdc413..f33eac5002 100644 --- a/docs/demos/examples/duckdb/febrl4.ipynb +++ b/docs/demos/examples/duckdb/febrl4.ipynb @@ -1,4151 +1,4151 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "b624dcc9-a6be-4996-8f78-1568568c2e6a", - "metadata": {}, - "source": [ - "## Linking the febrl4 datasets\n", - "\n", - "See A.2 [here](https://arxiv.org/pdf/2008.04443.pdf) and [here](https://recordlinkage.readthedocs.io/en/latest/ref-datasets.html) for the source of this data.\n", - "\n", - "It consists of two datasets, A and B, of 5000 records each, with each record in dataset A having a corresponding record in dataset B. The aim will be to capture as many of those 5000 true links as possible, with minimal false linkages.\n", - "\n", - "It is worth noting that we should not necessarily expect to capture _all_ links. 
There are some links where, although we know they _do_ correspond to the same person, the data is so mismatched between them that we would not reasonably expect a model to link them; indeed, should a model do so, it may indicate that we have overengineered things using our knowledge of true links, which will not be a helpful reference in situations where we attempt to link unlabelled data, as will usually be the case.\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "32963faf",
-   "metadata": {},
-   "source": [
-    "\n",
-    " \"Open\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "9c2be649",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2024-05-15T15:56:42.115992Z",
-     "iopub.status.busy": "2024-05-15T15:56:42.115623Z",
-     "iopub.status.idle": "2024-05-15T15:56:42.138818Z",
-     "shell.execute_reply": "2024-05-15T15:56:42.137554Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "# Uncomment and run this cell if you're running in Google Colab.\n",
-    "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "3547f018-c884-4b9e-a042-3df09a576582",
-   "metadata": {},
-   "source": [
-    "### Exploring data and defining model\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "05a3c2d4-6da8-48d5-89c8-db24702783c7",
-   "metadata": {},
-   "source": [
-    "Firstly let's read in the data and have a little look at it\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "832113c9-13b2-43b7-86d0-6051a9db79e8",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2024-05-15T15:56:42.144735Z",
-     "iopub.status.busy": "2024-05-15T15:56:42.144299Z",
-     "iopub.status.idle": "2024-05-15T15:56:44.123585Z",
-     "shell.execute_reply": "2024-05-15T15:56:44.122726Z"
-    }
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
rec_idgiven_namesurnamestreet_numberaddress_1address_2suburbpostcodestatedate_of_birthsoc_sec_idcluster
0rec-1070-orgmichaelaneumann8stanley streetmiamiwinston hills4223nsw191511115304218rec-1070
1rec-1016-orgcourtneypainter12pinkerton circuitbega flatsrichlands4560vic191612144066625rec-1016
\n", - "
" - ], - "text/plain": [ - " rec_id given_name surname street_number address_1 \\\n", - "0 rec-1070-org michaela neumann 8 stanley street \n", - "1 rec-1016-org courtney painter 12 pinkerton circuit \n", - "\n", - " address_2 suburb postcode state date_of_birth soc_sec_id \\\n", - "0 miami winston hills 4223 nsw 19151111 5304218 \n", - "1 bega flats richlands 4560 vic 19161214 4066625 \n", - "\n", - " cluster \n", - "0 rec-1070 \n", - "1 rec-1016 " + "cells": [ + { + "cell_type": "markdown", + "id": "b624dcc9-a6be-4996-8f78-1568568c2e6a", + "metadata": {}, + "source": [ + "## Linking the febrl4 datasets\n", + "\n", + "See A.2 [here](https://arxiv.org/pdf/2008.04443.pdf) and [here](https://recordlinkage.readthedocs.io/en/latest/ref-datasets.html) for the source of this data.\n", + "\n", + "It consists of two datasets, A and B, of 5000 records each, with each record in dataset A having a corresponding record in dataset B. The aim will be to capture as many of those 5000 true links as possible, with minimal false linkages.\n", + "\n", + "It is worth noting that we should not necessarily expect to capture _all_ links. There are some links that although we know they _do_ correspond to the same person, the data is so mismatched between them that we would not reasonably expect a model to link them, and indeed should a model do so may indicate that we have overengineered things using our knowledge of true links, which will not be a helpful reference in situations where we attempt to link unlabelled data, as will usually be the case.\n" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
rec_idgiven_namesurnamestreet_numberaddress_1address_2suburbpostcodestatedate_of_birthsoc_sec_idcluster
0rec-561-dup-0elton3light setreetpinehillwindermere3212vic196510131551941rec-561
1rec-2642-dup-0mitchellmaxon47edkins streetlochaoairnorth ryde3355nsw193902128859999rec-2642
\n", - "
" - ], - "text/plain": [ - " rec_id given_name surname street_number address_1 \\\n", - "0 rec-561-dup-0 elton 3 light setreet \n", - "1 rec-2642-dup-0 mitchell maxon 47 edkins street \n", - "\n", - " address_2 suburb postcode state date_of_birth soc_sec_id cluster \n", - "0 pinehill windermere 3212 vic 19651013 1551941 rec-561 \n", - "1 lochaoair north ryde 3355 nsw 19390212 8859999 rec-2642 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from splink import splink_datasets\n", - "\n", - "df_a = splink_datasets.febrl4a\n", - "df_b = splink_datasets.febrl4b\n", - "\n", - "\n", - "def prepare_data(data):\n", - " data = data.rename(columns=lambda x: x.strip())\n", - " data[\"cluster\"] = data[\"rec_id\"].apply(lambda x: \"-\".join(x.split(\"-\")[:2]))\n", - " data[\"date_of_birth\"] = data[\"date_of_birth\"].astype(str).str.strip()\n", - " data[\"soc_sec_id\"] = data[\"soc_sec_id\"].astype(str).str.strip()\n", - " data[\"postcode\"] = data[\"postcode\"].astype(str).str.strip()\n", - " return data\n", - "\n", - "\n", - "dfs = [prepare_data(dataset) for dataset in [df_a, df_b]]\n", - "\n", - "display(dfs[0].head(2))\n", - "display(dfs[1].head(2))" - ] - }, - { - "cell_type": "markdown", - "id": "8aebb0dd-28c1-44b8-9e12-e872b97f7583", - "metadata": {}, - "source": [ - "Next, to better understand which variables will prove useful in linking, we have a look at how populated each column is, as well as the distribution of unique values within each\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "3233c3e1-3e6b-4abc-8bed-c26e8b463c2a", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:56:44.128064Z", - "iopub.status.busy": "2024-05-15T15:56:44.127470Z", - "iopub.status.idle": "2024-05-15T15:56:44.412449Z", - "shell.execute_reply": "2024-05-15T15:56:44.410927Z" - } - }, - "outputs": [], - "source": [ - "from splink import DuckDBAPI, Linker, SettingsCreator\n", - "\n", - "basic_settings = SettingsCreator(\n", - " unique_id_column_name=\"rec_id\",\n", - " link_type=\"link_only\",\n", - " # NB as we are linking one-one, we know the probability that a random pair will be a match\n", - " # hence we could set:\n", - " # \"probability_two_random_records_match\": 1/5000,\n", - " # however we will not specify this here, as we will use this as a check that\n", - " # our estimation procedure returns something sensible\n", - ")\n", - "\n", - "linker = Linker(dfs, basic_settings, database_api=DuckDBAPI())" - ] - }, - { - "cell_type": "markdown", - "id": "c540f670", - "metadata": {}, - "source": [ - "It's usually a good idea to perform exploratory analysis on your data so you understand what's in each column and how often it's missing\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "319ffdbc-7853-40a9-b331-e635d96b6fdc", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:56:44.418048Z", - "iopub.status.busy": "2024-05-15T15:56:44.417174Z", - "iopub.status.idle": "2024-05-15T15:56:45.018140Z", - "shell.execute_reply": "2024-05-15T15:56:45.017233Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.LayerChart(...)" + "cell_type": "markdown", + "id": "32963faf", + "metadata": {}, + "source": [ + "\n", + " \"Open\n", + "\n" ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from splink.exploratory import completeness_chart\n", - "\n", - "completeness_chart(dfs, db_api=DuckDBAPI())" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "dff8dfca-57c8-42bf-878c-da9dd23d2682", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:56:45.022368Z", - "iopub.status.busy": "2024-05-15T15:56:45.021805Z", - "iopub.status.idle": "2024-05-15T15:56:45.760354Z", - "shell.execute_reply": "2024-05-15T15:56:45.759671Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.VConcatChart(...)" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from splink.exploratory import profile_columns\n", - "\n", - "profile_columns(dfs, db_api=DuckDBAPI(), column_expressions=[\"given_name\", \"surname\"])" - ] - }, - { - "cell_type": "markdown", - "id": "935fc769-8678-494b-96d9-f499c34ae061", - "metadata": {}, - "source": [ - "Next let's come up with some candidate blocking rules, which define which record comparisons are generated, and have a look at how many comparisons each will generate.\n", - "\n", - "For blocking rules that we use in prediction, our aim is to have the union of all rules cover all true matches, whilst avoiding generating so many comparisons that it becomes computationally intractable - i.e. each true match should have at least _one_ of the following conditions holding.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "e745280e-fe2f-4563-bd7e-6e4c70d0c9de", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:56:45.764541Z", - "iopub.status.busy": "2024-05-15T15:56:45.764220Z", - "iopub.status.idle": "2024-05-15T15:56:46.595508Z", - "shell.execute_reply": "2024-05-15T15:56:46.594573Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from splink import DuckDBAPI, block_on\n", - "from splink.blocking_analysis import (\n", - " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n", - ")\n", - "\n", - "blocking_rules = [\n", - " block_on(\"given_name\", \"surname\"),\n", - " # A blocking rule can also be an aribtrary SQL expression\n", - " \"l.given_name = r.surname and l.surname = r.given_name\",\n", - " block_on(\"date_of_birth\"),\n", - " block_on(\"soc_sec_id\"),\n", - " block_on(\"state\", \"address_1\"),\n", - " block_on(\"street_number\", \"address_1\"),\n", - " block_on(\"postcode\"),\n", - "]\n", - "\n", - "\n", - "db_api = DuckDBAPI()\n", - "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", - " table_or_tables=dfs,\n", - " blocking_rule_creators=blocking_rules,\n", - " db_api=db_api,\n", - " link_type=\"link_only\",\n", - " unique_id_column_name=\"rec_id\",\n", - " source_dataset_column_name=\"source_dataset\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "c91c8946-94e3-4ee0-b43f-2d9675339ac9", - "metadata": {}, - "source": [ - "The broadest rule, having a matching postcode, unsurpisingly gives the largest number of comparisons.\n", - "For this small dataset we still have a very manageable number, but if it was larger we might have needed to include a further `AND` condition with it to break the number of comparisons further.\n" - ] - }, - { - "cell_type": "markdown", - "id": "8fe64895-9292-4c86-983e-2ec3f140d12c", - "metadata": {}, - "source": [ - "Now we get the full settings by including the blocking rules, as well as deciding the actual comparisons we will be including in our model.\n", - "\n", - "We will define two models, each with a separate linker with different settings, so that we can compare performance. 
One will be a very basic model, whilst the other will include a lot more detail.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "f6360b69-2d52-4f1a-9199-2edf2339ec63", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:56:46.600071Z", - "iopub.status.busy": "2024-05-15T15:56:46.599766Z", - "iopub.status.idle": "2024-05-15T15:56:47.112399Z", - "shell.execute_reply": "2024-05-15T15:56:47.111220Z" - } - }, - "outputs": [], - "source": [ - "import splink.comparison_level_library as cll\n", - "import splink.comparison_library as cl\n", - "import splink.comparison_template_library as ctl\n", - "\n", - "# the simple model only considers a few columns, and only two comparison levels for each\n", - "simple_model_settings = SettingsCreator(\n", - " unique_id_column_name=\"rec_id\",\n", - " link_type=\"link_only\",\n", - " blocking_rules_to_generate_predictions=blocking_rules,\n", - " comparisons=[\n", - " cl.ExactMatch(\"given_name\").configure(term_frequency_adjustments=True),\n", - " cl.ExactMatch(\"surname\").configure(term_frequency_adjustments=True),\n", - " cl.ExactMatch(\"street_number\").configure(term_frequency_adjustments=True),\n", - " ],\n", - " retain_intermediate_calculation_columns=True,\n", - ")\n", - "\n", - "# the detailed model considers more columns, using the information we saw in the exploratory phase\n", - "# we also include further comparison levels to account for typos and other differences\n", - "detailed_model_settings = SettingsCreator(\n", - " unique_id_column_name=\"rec_id\",\n", - " link_type=\"link_only\",\n", - " blocking_rules_to_generate_predictions=blocking_rules,\n", - " comparisons=[\n", - " ctl.NameComparison(\"given_name\").configure(term_frequency_adjustments=True),\n", - " ctl.NameComparison(\"surname\").configure(term_frequency_adjustments=True),\n", - " ctl.DateComparison(\n", - " \"date_of_birth\",\n", - " input_is_string=True,\n", - " datetime_format=\"%Y%m%d\",\n", - " invalid_dates_as_null=True,\n", - " datetime_metrics=[\"month\", \"year\", \"year\"],\n", - " datetime_thresholds=[1, 1, 10],\n", - " ),\n", - " cl.DamerauLevenshteinAtThresholds(\"soc_sec_id\", [1, 2]),\n", - " cl.ExactMatch(\"street_number\").configure(term_frequency_adjustments=True),\n", - " cl.DamerauLevenshteinAtThresholds(\"postcode\", [1, 2]).configure(\n", - " term_frequency_adjustments=True\n", - " ),\n", - " # we don't consider further location columns as they will be strongly correlated with postcode\n", - " ],\n", - " retain_intermediate_calculation_columns=True,\n", - ")\n", - "\n", - "\n", - "linker_simple = Linker(dfs, simple_model_settings, database_api=DuckDBAPI())\n", - "linker_detailed = Linker(dfs, detailed_model_settings, database_api=DuckDBAPI())" - ] - }, - { - "cell_type": "markdown", - "id": "4b151420-f53b-4dab-9d80-238892cffd53", - "metadata": {}, - "source": [ - "### Estimating model parameters\n" - ] - }, - { - "cell_type": "markdown", - "id": "27f4d86a-3ec0-4d31-a8c7-eae2952e76a4", - "metadata": {}, - "source": [ - "We need to furnish our models with parameter estimates so that we can generate results. 
We will focus on the detailed model, generating the values for the simple model at the end\n" - ] - }, - { - "cell_type": "markdown", - "id": "3684d83f-44ce-46af-b3bd-0725f001b8d4", - "metadata": {}, - "source": [ - "We can instead estimate the probability two random records match, and compare with the known value of 1/5000 = 0.0002, to see how well our estimation procedure works.\n", - "\n", - "To do this we come up with some deterministic rules - the aim here is that we generate very few false positives (i.e. we expect that the majority of records with at least one of these conditions holding are true matches), whilst also capturing the majority of matches - our guess here is that these two rules should capture 80% of all matches.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "7ad48419-4eda-4fe5-b00f-2ec9f798e0e8", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:56:47.118143Z", - "iopub.status.busy": "2024-05-15T15:56:47.117804Z", - "iopub.status.idle": "2024-05-15T15:56:47.491169Z", - "shell.execute_reply": "2024-05-15T15:56:47.489974Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Probability two random records match is estimated to be 0.000239.\n", - "This means that amongst all possible pairwise record comparisons, one in 4,185.85 are expected to match. With 25,000,000 total possible comparisons, we expect a total of around 5,972.50 matching pairs\n" - ] - } - ], - "source": [ - "deterministic_rules = [\n", - " block_on(\"soc_sec_id\"),\n", - " block_on(\"given_name\", \"surname\", \"date_of_birth\"),\n", - "]\n", - "\n", - "linker_detailed.estimate_probability_two_random_records_match(\n", - " deterministic_rules, recall=0.8\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "0e035592-b1bb-4e27-a5b9-e890810088fb", - "metadata": {}, - "source": [ - "Even playing around with changing these deterministic rules, or the nominal recall leaves us with an answer which is pretty close to our known value\n" - ] - }, - { - "cell_type": "markdown", - "id": "bdaaa245-4bd9-476c-9ead-c5f28597aa7e", - "metadata": {}, - "source": [ - "Next we estimate `u` and `m` values for each comparison, so that we can move to generating predictions\n" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "e40ee288-0c42-4cda-aaf1-3ffb2ea02383", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:56:47.497349Z", - "iopub.status.busy": "2024-05-15T15:56:47.496965Z", - "iopub.status.idle": "2024-05-15T15:56:59.095072Z", - "shell.execute_reply": "2024-05-15T15:56:59.094337Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "----- Estimating u probabilities using random sampling -----\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 month' (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 year' (comparison vector value: 2). 
This usually means the comparison level was never observed in the training data.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 10 year' (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "Estimated u probabilities using random sampling\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "Your model is not yet fully trained. Missing estimates for:\n",
-      " - given_name (no m values are trained).\n",
-      " - surname (no m values are trained).\n",
-      " - date_of_birth (some u values are not trained, no m values are trained).\n",
-      " - soc_sec_id (no m values are trained).\n",
-      " - street_number (no m values are trained).\n",
-      " - postcode (no m values are trained).\n"
-     ]
-    }
-   ],
-   "source": [
-    "# We generally recommend setting max pairs higher (e.g. 1e7 or more)\n",
-    "# But this will run faster for the purpose of this demo\n",
-    "linker_detailed.estimate_u_using_random_sampling(max_pairs=1e6)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "614f6e19-14bb-4d40-9b95-36593b6de9ba",
-   "metadata": {},
-   "source": [
-    "When training the `m` values using expectation maximisation, we need some more blocking rules to reduce the total number of comparisons. For each rule, we want to ensure that we have neither proportionally too many matches, nor too few.\n",
-    "\n",
-    "We must run this multiple times using different rules so that we can obtain estimates for all comparisons - if we block on e.g. `date_of_birth`, then we cannot compute the `m` values for the `date_of_birth` comparison, as we have only looked at records where these match.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "id": "9ee0f49b-084c-45aa-8c6b-ec5da11c2cc4",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2024-05-15T15:56:59.100504Z",
-     "iopub.status.busy": "2024-05-15T15:56:59.100174Z",
-     "iopub.status.idle": "2024-05-15T15:57:01.059609Z",
-     "shell.execute_reply": "2024-05-15T15:57:01.058521Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "----- Starting EM training session -----\n",
-      "\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Estimating the m probabilities of the model by blocking on:\n",
-      "l.\"date_of_birth\" = r.\"date_of_birth\"\n",
-      "\n",
-      "Parameter estimates will be made for the following comparison(s):\n",
-      " - given_name\n",
-      " - surname\n",
-      " - soc_sec_id\n",
-      " - street_number\n",
-      " - postcode\n",
-      "\n",
-      "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
-      " - date_of_birth\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Iteration 1: Largest change in params was -0.312 in probability_two_random_records_match\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Iteration 2: Largest change in params was 0.00363 in the m_probability of given_name, level `All other comparisons`\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Iteration 3: Largest change in params was 8.66e-05 in the m_probability of soc_sec_id, level `All other comparisons`\n"
-     ]
-    },
-    {
-     "name": 
"stderr", - "output_type": "stream", - "text": [ - "\n", - "EM converged after 3 iterations\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Your model is not yet fully trained. Missing estimates for:\n", - " - date_of_birth (some u values are not trained, no m values are trained).\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "----- Starting EM training session -----\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Estimating the m probabilities of the model by blocking on:\n", - "l.\"postcode\" = r.\"postcode\"\n", - "\n", - "Parameter estimates will be made for the following comparison(s):\n", - " - given_name\n", - " - surname\n", - " - date_of_birth\n", - " - soc_sec_id\n", - " - street_number\n", - "\n", - "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", - " - postcode\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:\n", - "Level Abs difference of 'transformed date_of_birth <= 1 month' on comparison date_of_birth not observed in dataset, unable to train m value\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:\n", - "Level Abs difference of 'transformed date_of_birth <= 1 year' on comparison date_of_birth not observed in dataset, unable to train m value\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:\n", - "Level Abs difference of 'transformed date_of_birth <= 10 year' on comparison date_of_birth not observed in dataset, unable to train m value\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 1: Largest change in params was 0.0374 in the m_probability of date_of_birth, level `All other comparisons`\n" - ] + "cell_type": "code", + "execution_count": 1, + "id": "9c2be649", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:56:42.115992Z", + "iopub.status.busy": "2024-05-15T15:56:42.115623Z", + "iopub.status.idle": "2024-05-15T15:56:42.138818Z", + "shell.execute_reply": "2024-05-15T15:56:42.137554Z" + } + }, + "outputs": [], + "source": [ + "# Uncomment and run this cell if you're running in Google Colab.\n", + "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 2: Largest change in params was 0.000462 in the m_probability of date_of_birth, level `All other comparisons`\n" - ] + "cell_type": "markdown", + "id": "3547f018-c884-4b9e-a042-3df09a576582", + "metadata": {}, + "source": [ + "### Exploring data and defining model\n" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 3: Largest change in params was 8.02e-06 in the m_probability of soc_sec_id, level `All other comparisons`\n" - ] + "cell_type": "markdown", + "id": "05a3c2d4-6da8-48d5-89c8-db24702783c7", + "metadata": {}, + "source": [ + "Firstly let's read in the data and have a little look at it\n" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "EM converged after 3 iterations\n" - ] + "cell_type": "code", + "execution_count": 2, + "id": "832113c9-13b2-43b7-86d0-6051a9db79e8", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:56:42.144735Z", + "iopub.status.busy": 
"2024-05-15T15:56:42.144299Z", + "iopub.status.idle": "2024-05-15T15:56:44.123585Z", + "shell.execute_reply": "2024-05-15T15:56:44.122726Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rec_idgiven_namesurnamestreet_numberaddress_1address_2suburbpostcodestatedate_of_birthsoc_sec_idcluster
0rec-1070-orgmichaelaneumann8stanley streetmiamiwinston hills4223nsw191511115304218rec-1070
1rec-1016-orgcourtneypainter12pinkerton circuitbega flatsrichlands4560vic191612144066625rec-1016
\n", + "
" + ], + "text/plain": [ + " rec_id given_name surname street_number address_1 \\\n", + "0 rec-1070-org michaela neumann 8 stanley street \n", + "1 rec-1016-org courtney painter 12 pinkerton circuit \n", + "\n", + " address_2 suburb postcode state date_of_birth soc_sec_id \\\n", + "0 miami winston hills 4223 nsw 19151111 5304218 \n", + "1 bega flats richlands 4560 vic 19161214 4066625 \n", + "\n", + " cluster \n", + "0 rec-1070 \n", + "1 rec-1016 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rec_idgiven_namesurnamestreet_numberaddress_1address_2suburbpostcodestatedate_of_birthsoc_sec_idcluster
0rec-561-dup-0elton3light setreetpinehillwindermere3212vic196510131551941rec-561
1rec-2642-dup-0mitchellmaxon47edkins streetlochaoairnorth ryde3355nsw193902128859999rec-2642
\n", + "
" + ], + "text/plain": [ + " rec_id given_name surname street_number address_1 \\\n", + "0 rec-561-dup-0 elton 3 light setreet \n", + "1 rec-2642-dup-0 mitchell maxon 47 edkins street \n", + "\n", + " address_2 suburb postcode state date_of_birth soc_sec_id cluster \n", + "0 pinehill windermere 3212 vic 19651013 1551941 rec-561 \n", + "1 lochaoair north ryde 3355 nsw 19390212 8859999 rec-2642 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from splink import splink_datasets\n", + "\n", + "df_a = splink_datasets.febrl4a\n", + "df_b = splink_datasets.febrl4b\n", + "\n", + "\n", + "def prepare_data(data):\n", + " data = data.rename(columns=lambda x: x.strip())\n", + " data[\"cluster\"] = data[\"rec_id\"].apply(lambda x: \"-\".join(x.split(\"-\")[:2]))\n", + " data[\"date_of_birth\"] = data[\"date_of_birth\"].astype(str).str.strip()\n", + " data[\"soc_sec_id\"] = data[\"soc_sec_id\"].astype(str).str.strip()\n", + " data[\"postcode\"] = data[\"postcode\"].astype(str).str.strip()\n", + " return data\n", + "\n", + "\n", + "dfs = [prepare_data(dataset) for dataset in [df_a, df_b]]\n", + "\n", + "display(dfs[0].head(2))\n", + "display(dfs[1].head(2))" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 month' (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n" - ] + "cell_type": "markdown", + "id": "8aebb0dd-28c1-44b8-9e12-e872b97f7583", + "metadata": {}, + "source": [ + "Next, to better understand which variables will prove useful in linking, we have a look at how populated each column is, as well as the distribution of unique values within each\n" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 year' (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n" - ] + "cell_type": "code", + "execution_count": 3, + "id": "3233c3e1-3e6b-4abc-8bed-c26e8b463c2a", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:56:44.128064Z", + "iopub.status.busy": "2024-05-15T15:56:44.127470Z", + "iopub.status.idle": "2024-05-15T15:56:44.412449Z", + "shell.execute_reply": "2024-05-15T15:56:44.410927Z" + } + }, + "outputs": [], + "source": [ + "from splink import DuckDBAPI, Linker, SettingsCreator\n", + "\n", + "basic_settings = SettingsCreator(\n", + " unique_id_column_name=\"rec_id\",\n", + " link_type=\"link_only\",\n", + " # NB as we are linking one-one, we know the probability that a random pair will be a match\n", + " # hence we could set:\n", + " # \"probability_two_random_records_match\": 1/5000,\n", + " # however we will not specify this here, as we will use this as a check that\n", + " # our estimation procedure returns something sensible\n", + ")\n", + "\n", + "linker = Linker(dfs, basic_settings, database_api=DuckDBAPI())" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 10 year' (comparison vector value: 1). 
This usually means the comparison level was never observed in the training data.\n" - ] + "cell_type": "markdown", + "id": "c540f670", + "metadata": {}, + "source": [ + "It's usually a good idea to perform exploratory analysis on your data so you understand what's in each column and how often it's missing\n" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Your model is not yet fully trained. Missing estimates for:\n", - " - date_of_birth (some u values are not trained, some m values are not trained).\n" - ] - } - ], - "source": [ - "session_dob = linker_detailed.estimate_parameters_using_expectation_maximisation(\n", - " block_on(\"date_of_birth\"), estimate_without_term_frequencies=True\n", - ")\n", - "session_pc = linker_detailed.estimate_parameters_using_expectation_maximisation(\n", - " block_on(\"postcode\"), estimate_without_term_frequencies=True\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "ba8ed5fa-7003-46a9-bc40-4ae7cfb40953", - "metadata": {}, - "source": [ - "If we wish we can have a look at how our parameter estimates changes over these training sessions\n" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "31ef6844-6be8-4f01-9ff7-5dfebcf12ae1", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:57:01.065654Z", - "iopub.status.busy": "2024-05-15T15:57:01.065325Z", - "iopub.status.idle": "2024-05-15T15:57:01.389061Z", - "shell.execute_reply": "2024-05-15T15:57:01.388339Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" + "cell_type": "code", + "execution_count": 4, + "id": "319ffdbc-7853-40a9-b331-e635d96b6fdc", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:56:44.418048Z", + "iopub.status.busy": "2024-05-15T15:56:44.417174Z", + "iopub.status.idle": "2024-05-15T15:56:45.018140Z", + "shell.execute_reply": "2024-05-15T15:56:45.017233Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - "alt.HConcatChart(...)" + "source": [ + "from splink.exploratory import completeness_chart\n", + "\n", + "completeness_chart(dfs, db_api=DuckDBAPI())" ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "session_dob.m_u_values_interactive_history_chart()" - ] - }, - { - "cell_type": "markdown", - "id": "cffd7f8f-6cea-4ef7-87c7-c6a9c1775cf2", - "metadata": {}, - "source": [ - "For variables that aren't used in the `m`-training blocking rules, we have two estimates --- one from each of the training sessions (see for example `street_number`). We can have a look at how the values compare between them, to ensure that we don't have drastically different values, which may be indicative of an issue.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "8d260a60-a4fa-4c0d-9853-8b8256a24257", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:57:01.393145Z", - "iopub.status.busy": "2024-05-15T15:57:01.392842Z", - "iopub.status.idle": "2024-05-15T15:57:01.561233Z", - "shell.execute_reply": "2024-05-15T15:57:01.560475Z" - } - }, - "outputs": [ + }, { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" + "cell_type": "code", + "execution_count": 5, + "id": "dff8dfca-57c8-42bf-878c-da9dd23d2682", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:56:45.022368Z", + "iopub.status.busy": "2024-05-15T15:56:45.021805Z", + "iopub.status.idle": "2024-05-15T15:56:45.760354Z", + "shell.execute_reply": "2024-05-15T15:56:45.759671Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - "alt.Chart(...)" + "source": [ + "from splink.exploratory import profile_columns\n", + "\n", + "profile_columns(dfs, db_api=DuckDBAPI(), column_expressions=[\"given_name\", \"surname\"])" ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "linker_detailed.parameter_estimate_comparisons_chart()" - ] - }, - { - "cell_type": "markdown", - "id": "25e3e343-603a-4aed-a5ac-5de42af5f8ad", - "metadata": {}, - "source": [ - "We repeat our parameter estimations for the simple model in much the same fashion\n" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "71f2f166-05cd-4038-a289-a053a1f0b5c5", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:57:01.565611Z", - "iopub.status.busy": "2024-05-15T15:57:01.565220Z", - "iopub.status.idle": "2024-05-15T15:57:04.177024Z", - "shell.execute_reply": "2024-05-15T15:57:04.176371Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Probability two random records match is estimated to be 0.000239.\n", - "This means that amongst all possible pairwise record comparisons, one in 4,185.85 are expected to match. With 25,000,000 total possible comparisons, we expect a total of around 5,972.50 matching pairs\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "----- Estimating u probabilities using random sampling -----\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Estimated u probabilities using random sampling\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Your model is not yet fully trained. 
Missing estimates for:\n", - " - given_name (no m values are trained).\n", - " - surname (no m values are trained).\n", - " - street_number (no m values are trained).\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "----- Starting EM training session -----\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Estimating the m probabilities of the model by blocking on:\n", - "l.\"given_name\" = r.\"given_name\"\n", - "\n", - "Parameter estimates will be made for the following comparison(s):\n", - " - surname\n", - " - street_number\n", - "\n", - "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", - " - given_name\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 1: Largest change in params was 0.0821 in the m_probability of surname, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 2: Largest change in params was -0.0237 in the m_probability of surname, level `Exact match on surname`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 3: Largest change in params was 0.0222 in the m_probability of surname, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 4: Largest change in params was -0.0205 in the m_probability of surname, level `Exact match on surname`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 5: Largest change in params was 0.018 in the m_probability of surname, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 6: Largest change in params was 0.0151 in the m_probability of surname, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 7: Largest change in params was -0.0123 in the m_probability of surname, level `Exact match on surname`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 8: Largest change in params was -0.0097 in the m_probability of surname, level `Exact match on surname`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 9: Largest change in params was 0.00751 in the m_probability of surname, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 10: Largest change in params was 0.00573 in the m_probability of surname, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 11: Largest change in params was 0.00434 in the m_probability of surname, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 12: Largest change in params was -0.00326 in the m_probability of surname, level `Exact match on surname`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 13: Largest change in params was 0.00245 in the m_probability of surname, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 14: Largest change in params was -0.00183 in the m_probability of surname, level `Exact match on surname`\n" - ] - }, - { - "name": "stderr", - 
"output_type": "stream", - "text": [ - "Iteration 15: Largest change in params was 0.00137 in the m_probability of surname, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 16: Largest change in params was -0.00103 in the m_probability of surname, level `Exact match on surname`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 17: Largest change in params was -0.000769 in the m_probability of surname, level `Exact match on surname`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 18: Largest change in params was -0.000576 in the m_probability of surname, level `Exact match on surname`\n" - ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 19: Largest change in params was 0.000432 in the m_probability of surname, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 20: Largest change in params was -0.000324 in the m_probability of surname, level `Exact match on surname`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 21: Largest change in params was 0.000243 in the m_probability of surname, level `All other comparisons`\n" - ] + "cell_type": "markdown", + "id": "935fc769-8678-494b-96d9-f499c34ae061", + "metadata": {}, + "source": [ + "Next let's come up with some candidate blocking rules, which define which record comparisons are generated, and have a look at how many comparisons each will generate.\n", + "\n", + "For blocking rules that we use in prediction, our aim is to have the union of all rules cover all true matches, whilst avoiding generating so many comparisons that it becomes computationally intractable - i.e. each true match should have at least _one_ of the following conditions holding.\n" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 22: Largest change in params was -0.000182 in the m_probability of surname, level `Exact match on surname`\n" - ] + "cell_type": "code", + "execution_count": 6, + "id": "e745280e-fe2f-4563-bd7e-6e4c70d0c9de", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:56:45.764541Z", + "iopub.status.busy": "2024-05-15T15:56:45.764220Z", + "iopub.status.idle": "2024-05-15T15:56:46.595508Z", + "shell.execute_reply": "2024-05-15T15:56:46.594573Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink import DuckDBAPI, block_on\n", + "from splink.blocking_analysis import (\n", + " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n", + ")\n", + "\n", + "blocking_rules = [\n", + " block_on(\"given_name\", \"surname\"),\n", + " # A blocking rule can also be an aribtrary SQL expression\n", + " \"l.given_name = r.surname and l.surname = r.given_name\",\n", + " block_on(\"date_of_birth\"),\n", + " block_on(\"soc_sec_id\"),\n", + " block_on(\"state\", \"address_1\"),\n", + " block_on(\"street_number\", \"address_1\"),\n", + " block_on(\"postcode\"),\n", + "]\n", + "\n", + "\n", + "db_api = DuckDBAPI()\n", + "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", + " table_or_tables=dfs,\n", + " blocking_rules=blocking_rules,\n", + " db_api=db_api,\n", + " link_type=\"link_only\",\n", + " unique_id_column_name=\"rec_id\",\n", + " source_dataset_column_name=\"source_dataset\",\n", + ")" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 23: Largest change in params was -0.000137 in the m_probability of surname, level `Exact match on surname`\n" - ] + "cell_type": "markdown", + "id": "c91c8946-94e3-4ee0-b43f-2d9675339ac9", + "metadata": {}, + "source": [ + "The broadest rule, having a matching postcode, unsurpisingly gives the largest number of comparisons.\n", + "For this small dataset we still have a very manageable number, but if it was larger we might have needed to include a further `AND` condition with it to break the number of comparisons further.\n" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 24: Largest change in params was -0.000103 in the m_probability of surname, level `Exact match on surname`\n" - ] + "cell_type": "markdown", + "id": "8fe64895-9292-4c86-983e-2ec3f140d12c", + "metadata": {}, + "source": [ + "Now we get the full settings by including the blocking rules, as well as deciding the actual comparisons we will be including in our model.\n", + "\n", + "We will define two models, each with a separate linker with different settings, so that we can compare performance. 
One will be a very basic model, whilst the other will include a lot more detail.\n" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 25: Largest change in params was -7.73e-05 in the m_probability of surname, level `Exact match on surname`\n" - ] + "cell_type": "code", + "execution_count": 7, + "id": "f6360b69-2d52-4f1a-9199-2edf2339ec63", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:56:46.600071Z", + "iopub.status.busy": "2024-05-15T15:56:46.599766Z", + "iopub.status.idle": "2024-05-15T15:56:47.112399Z", + "shell.execute_reply": "2024-05-15T15:56:47.111220Z" + } + }, + "outputs": [], + "source": [ + "import splink.comparison_level_library as cll\n", + "import splink.comparison_library as cl\n", + "import splink.comparison_template_library as ctl\n", + "\n", + "# the simple model only considers a few columns, and only two comparison levels for each\n", + "simple_model_settings = SettingsCreator(\n", + " unique_id_column_name=\"rec_id\",\n", + " link_type=\"link_only\",\n", + " blocking_rules_to_generate_predictions=blocking_rules,\n", + " comparisons=[\n", + " cl.ExactMatch(\"given_name\").configure(term_frequency_adjustments=True),\n", + " cl.ExactMatch(\"surname\").configure(term_frequency_adjustments=True),\n", + " cl.ExactMatch(\"street_number\").configure(term_frequency_adjustments=True),\n", + " ],\n", + " retain_intermediate_calculation_columns=True,\n", + ")\n", + "\n", + "# the detailed model considers more columns, using the information we saw in the exploratory phase\n", + "# we also include further comparison levels to account for typos and other differences\n", + "detailed_model_settings = SettingsCreator(\n", + " unique_id_column_name=\"rec_id\",\n", + " link_type=\"link_only\",\n", + " blocking_rules_to_generate_predictions=blocking_rules,\n", + " comparisons=[\n", + " ctl.NameComparison(\"given_name\").configure(term_frequency_adjustments=True),\n", + " ctl.NameComparison(\"surname\").configure(term_frequency_adjustments=True),\n", + " ctl.DateComparison(\n", + " \"date_of_birth\",\n", + " input_is_string=True,\n", + " datetime_format=\"%Y%m%d\",\n", + " invalid_dates_as_null=True,\n", + " datetime_metrics=[\"month\", \"year\", \"year\"],\n", + " datetime_thresholds=[1, 1, 10],\n", + " ),\n", + " cl.DamerauLevenshteinAtThresholds(\"soc_sec_id\", [1, 2]),\n", + " cl.ExactMatch(\"street_number\").configure(term_frequency_adjustments=True),\n", + " cl.DamerauLevenshteinAtThresholds(\"postcode\", [1, 2]).configure(\n", + " term_frequency_adjustments=True\n", + " ),\n", + " # we don't consider further location columns as they will be strongly correlated with postcode\n", + " ],\n", + " retain_intermediate_calculation_columns=True,\n", + ")\n", + "\n", + "\n", + "linker_simple = Linker(dfs, simple_model_settings, database_api=DuckDBAPI())\n", + "linker_detailed = Linker(dfs, detailed_model_settings, database_api=DuckDBAPI())" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "EM converged after 25 iterations\n" - ] + "cell_type": "markdown", + "id": "4b151420-f53b-4dab-9d80-238892cffd53", + "metadata": {}, + "source": [ + "### Estimating model parameters\n" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Your model is not yet fully trained. 
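
On the earlier point about adding a further `AND` condition to the broad postcode rule: a hedged sketch of what that could look like. The extra columns chosen here are illustrative, not a recommendation:

```python
from splink import block_on

# Two equivalent ways of tightening the postcode rule - comparisons are
# generated only where records agree on postcode *and* the extra condition.
tighter_rule = block_on("postcode", "state")
tighter_rule_sql = "l.postcode = r.postcode and l.state = r.state"
```
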
Missing estimates for:\n", - " - given_name (no m values are trained).\n" - ] + "cell_type": "markdown", + "id": "27f4d86a-3ec0-4d31-a8c7-eae2952e76a4", + "metadata": {}, + "source": [ + "We need to furnish our models with parameter estimates so that we can generate results. We will focus on the detailed model, generating the values for the simple model at the end\n" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "----- Starting EM training session -----\n", - "\n" - ] + "cell_type": "markdown", + "id": "3684d83f-44ce-46af-b3bd-0725f001b8d4", + "metadata": {}, + "source": [ + "We can instead estimate the probability two random records match, and compare with the known value of 1/5000 = 0.0002, to see how well our estimation procedure works.\n", + "\n", + "To do this we come up with some deterministic rules - the aim here is that we generate very few false positives (i.e. we expect that the majority of records with at least one of these conditions holding are true matches), whilst also capturing the majority of matches - our guess here is that these two rules should capture 80% of all matches.\n" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Estimating the m probabilities of the model by blocking on:\n", - "l.\"street_number\" = r.\"street_number\"\n", - "\n", - "Parameter estimates will be made for the following comparison(s):\n", - " - given_name\n", - " - surname\n", - "\n", - "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", - " - street_number\n" - ] + "cell_type": "code", + "execution_count": 8, + "id": "7ad48419-4eda-4fe5-b00f-2ec9f798e0e8", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:56:47.118143Z", + "iopub.status.busy": "2024-05-15T15:56:47.117804Z", + "iopub.status.idle": "2024-05-15T15:56:47.491169Z", + "shell.execute_reply": "2024-05-15T15:56:47.489974Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Probability two random records match is estimated to be 0.000239.\n", + "This means that amongst all possible pairwise record comparisons, one in 4,185.85 are expected to match. 
With 25,000,000 total possible comparisons, we expect a total of around 5,972.50 matching pairs\n"
+ ]
+ }
+ ],
+ "source": [
+ "deterministic_rules = [\n",
+ " block_on(\"soc_sec_id\"),\n",
+ " block_on(\"given_name\", \"surname\", \"date_of_birth\"),\n",
+ "]\n",
+ "\n",
+ "linker_detailed.estimate_probability_two_random_records_match(\n",
+ " deterministic_rules, recall=0.8\n",
+ ")"
+ ]
 },
 {
+ "cell_type": "markdown",
+ "id": "0e035592-b1bb-4e27-a5b9-e890810088fb",
+ "metadata": {},
+ "source": [
+ "Even if we play around with these deterministic rules, or with the nominal recall, we are left with an answer which is pretty close to our known value\n"
+ ]
 },
 {
+ "cell_type": "markdown",
+ "id": "bdaaa245-4bd9-476c-9ead-c5f28597aa7e",
+ "metadata": {},
+ "source": [
+ "Next we estimate `u` and `m` values for each comparison, so that we can move to generating predictions\n"
+ ]
 },
 {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "e40ee288-0c42-4cda-aaf1-3ffb2ea02383",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-05-15T15:56:47.497349Z",
+ "iopub.status.busy": "2024-05-15T15:56:47.496965Z",
+ "iopub.status.idle": "2024-05-15T15:56:59.095072Z",
+ "shell.execute_reply": "2024-05-15T15:56:59.094337Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "----- Estimating u probabilities using random sampling -----\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 month' (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 year' (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 10 year' (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Estimated u probabilities using random sampling\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Your model is not yet fully trained. Missing estimates for:\n",
+ " - given_name (no m values are trained).\n",
+ " - surname (no m values are trained).\n",
+ " - date_of_birth (some u values are not trained, no m values are trained).\n",
+ " - soc_sec_id (no m values are trained).\n",
+ " - street_number (no m values are trained).\n",
+ " - postcode (no m values are trained).\n"
+ ]
+ }
+ ],
+ "source": [
+ "# We generally recommend setting max pairs higher (e.g. 
1e7 or more)\n", + "# But this will run faster for the purpose of this demo\n", + "linker_detailed.estimate_u_using_random_sampling(max_pairs=1e6)" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 3: Largest change in params was -0.0251 in the m_probability of given_name, level `Exact match on given_name`\n" - ] + "cell_type": "markdown", + "id": "614f6e19-14bb-4d40-9b95-36593b6de9ba", + "metadata": {}, + "source": [ + "When training the `m` values using expectation maximisation, we need somre more blocking rules to reduce the total number of comparisons. For each rule, we want to ensure that we have neither proportionally too many matches, or too few.\n", + "\n", + "We must run this multiple times using different rules so that we can obtain estimates for all comparisons - if we block on e.g. `date_of_birth`, then we cannot compute the `m` values for the `date_of_birth` comparison, as we have only looked at records where these match.\n" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 4: Largest change in params was 0.0246 in the m_probability of given_name, level `All other comparisons`\n" - ] + "cell_type": "code", + "execution_count": 10, + "id": "9ee0f49b-084c-45aa-8c6b-ec5da11c2cc4", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:56:59.100504Z", + "iopub.status.busy": "2024-05-15T15:56:59.100174Z", + "iopub.status.idle": "2024-05-15T15:57:01.059609Z", + "shell.execute_reply": "2024-05-15T15:57:01.058521Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.\"date_of_birth\" = r.\"date_of_birth\"\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - given_name\n", + " - surname\n", + " - soc_sec_id\n", + " - street_number\n", + " - postcode\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - date_of_birth\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was -0.312 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was 0.00363 in the m_probability of given_name, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was 8.66e-05 in the m_probability of soc_sec_id, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 3 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. 
Missing estimates for:\n", + " - date_of_birth (some u values are not trained, no m values are trained).\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.\"postcode\" = r.\"postcode\"\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - given_name\n", + " - surname\n", + " - date_of_birth\n", + " - soc_sec_id\n", + " - street_number\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - postcode\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:\n", + "Level Abs difference of 'transformed date_of_birth <= 1 month' on comparison date_of_birth not observed in dataset, unable to train m value\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:\n", + "Level Abs difference of 'transformed date_of_birth <= 1 year' on comparison date_of_birth not observed in dataset, unable to train m value\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:\n", + "Level Abs difference of 'transformed date_of_birth <= 10 year' on comparison date_of_birth not observed in dataset, unable to train m value\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was 0.0374 in the m_probability of date_of_birth, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was 0.000462 in the m_probability of date_of_birth, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was 8.02e-06 in the m_probability of soc_sec_id, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 3 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 month' (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 year' (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 10 year' (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. 
Missing estimates for:\n", + " - date_of_birth (some u values are not trained, some m values are not trained).\n" + ] + } + ], + "source": [ + "session_dob = linker_detailed.estimate_parameters_using_expectation_maximisation(\n", + " block_on(\"date_of_birth\"), estimate_without_term_frequencies=True\n", + ")\n", + "session_pc = linker_detailed.estimate_parameters_using_expectation_maximisation(\n", + " block_on(\"postcode\"), estimate_without_term_frequencies=True\n", + ")" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 5: Largest change in params was 0.0228 in the m_probability of given_name, level `All other comparisons`\n" - ] + "cell_type": "markdown", + "id": "ba8ed5fa-7003-46a9-bc40-4ae7cfb40953", + "metadata": {}, + "source": [ + "If we wish we can have a look at how our parameter estimates changes over these training sessions\n" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 6: Largest change in params was -0.0199 in the m_probability of given_name, level `Exact match on given_name`\n" - ] + "cell_type": "code", + "execution_count": 11, + "id": "31ef6844-6be8-4f01-9ff7-5dfebcf12ae1", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:57:01.065654Z", + "iopub.status.busy": "2024-05-15T15:57:01.065325Z", + "iopub.status.idle": "2024-05-15T15:57:01.389061Z", + "shell.execute_reply": "2024-05-15T15:57:01.388339Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.HConcatChart(...)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "session_dob.m_u_values_interactive_history_chart()" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 7: Largest change in params was -0.0166 in the m_probability of given_name, level `Exact match on given_name`\n" - ] + "cell_type": "markdown", + "id": "cffd7f8f-6cea-4ef7-87c7-c6a9c1775cf2", + "metadata": {}, + "source": [ + "For variables that aren't used in the `m`-training blocking rules, we have two estimates --- one from each of the training sessions (see for example `street_number`). We can have a look at how the values compare between them, to ensure that we don't have drastically different values, which may be indicative of an issue.\n" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 8: Largest change in params was 0.0134 in the m_probability of given_name, level `All other comparisons`\n" - ] + "cell_type": "code", + "execution_count": 12, + "id": "8d260a60-a4fa-4c0d-9853-8b8256a24257", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:57:01.393145Z", + "iopub.status.busy": "2024-05-15T15:57:01.392842Z", + "iopub.status.idle": "2024-05-15T15:57:01.561233Z", + "shell.execute_reply": "2024-05-15T15:57:01.560475Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker_detailed.parameter_estimate_comparisons_chart()" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 9: Largest change in params was -0.0105 in the m_probability of given_name, level `Exact match on given_name`\n" - ] + "cell_type": "markdown", + "id": "25e3e343-603a-4aed-a5ac-5de42af5f8ad", + "metadata": {}, + "source": [ + "We repeat our parameter estimations for the simple model in much the same fashion\n" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 10: Largest change in params was -0.00801 in the m_probability of given_name, level `Exact match on given_name`\n" - ] + "cell_type": "code", + "execution_count": 13, + "id": "71f2f166-05cd-4038-a289-a053a1f0b5c5", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:57:01.565611Z", + "iopub.status.busy": "2024-05-15T15:57:01.565220Z", + "iopub.status.idle": "2024-05-15T15:57:04.177024Z", + "shell.execute_reply": "2024-05-15T15:57:04.176371Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Probability two random records match is estimated to be 0.000239.\n", + "This means that amongst all possible pairwise record comparisons, one in 4,185.85 are expected to match. With 25,000,000 total possible comparisons, we expect a total of around 5,972.50 matching pairs\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "----- Estimating u probabilities using random sampling -----\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Estimated u probabilities using random sampling\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. 
Missing estimates for:\n", + " - given_name (no m values are trained).\n", + " - surname (no m values are trained).\n", + " - street_number (no m values are trained).\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.\"given_name\" = r.\"given_name\"\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - surname\n", + " - street_number\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - given_name\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was 0.0821 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was -0.0237 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was 0.0222 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 4: Largest change in params was -0.0205 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 5: Largest change in params was 0.018 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 6: Largest change in params was 0.0151 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 7: Largest change in params was -0.0123 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 8: Largest change in params was -0.0097 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 9: Largest change in params was 0.00751 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 10: Largest change in params was 0.00573 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 11: Largest change in params was 0.00434 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 12: Largest change in params was -0.00326 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 13: Largest change in params was 0.00245 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 14: Largest change in params was -0.00183 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "Iteration 15: Largest change in params was 0.00137 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 16: Largest change in params was -0.00103 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 17: Largest change in params was -0.000769 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 18: Largest change in params was -0.000576 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 19: Largest change in params was 0.000432 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 20: Largest change in params was -0.000324 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 21: Largest change in params was 0.000243 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 22: Largest change in params was -0.000182 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 23: Largest change in params was -0.000137 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 24: Largest change in params was -0.000103 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 25: Largest change in params was -7.73e-05 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 25 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. 
Missing estimates for:\n", + " - given_name (no m values are trained).\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.\"street_number\" = r.\"street_number\"\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - given_name\n", + " - surname\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - street_number\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was 0.0513 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was -0.025 in the m_probability of given_name, level `Exact match on given_name`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was -0.0251 in the m_probability of given_name, level `Exact match on given_name`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 4: Largest change in params was 0.0246 in the m_probability of given_name, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 5: Largest change in params was 0.0228 in the m_probability of given_name, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 6: Largest change in params was -0.0199 in the m_probability of given_name, level `Exact match on given_name`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 7: Largest change in params was -0.0166 in the m_probability of given_name, level `Exact match on given_name`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 8: Largest change in params was 0.0134 in the m_probability of given_name, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 9: Largest change in params was -0.0105 in the m_probability of given_name, level `Exact match on given_name`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 10: Largest change in params was -0.00801 in the m_probability of given_name, level `Exact match on given_name`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 11: Largest change in params was 0.00607 in the m_probability of given_name, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 12: Largest change in params was -0.00457 in the m_probability of given_name, level `Exact match on given_name`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 13: Largest change in params was 0.00344 in the m_probability of given_name, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 14: Largest change in params was 0.0026 in the m_probability of given_name, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + 
"text": [ + "Iteration 15: Largest change in params was -0.00197 in the m_probability of given_name, level `Exact match on given_name`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 16: Largest change in params was 0.0015 in the m_probability of given_name, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 17: Largest change in params was -0.00115 in the m_probability of given_name, level `Exact match on given_name`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 18: Largest change in params was 0.00089 in the m_probability of given_name, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 19: Largest change in params was -0.000693 in the m_probability of given_name, level `Exact match on given_name`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 20: Largest change in params was 0.000542 in the m_probability of given_name, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 21: Largest change in params was -0.000426 in the m_probability of given_name, level `Exact match on given_name`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 22: Largest change in params was -0.000337 in the m_probability of given_name, level `Exact match on given_name`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 23: Largest change in params was 0.000274 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 24: Largest change in params was -0.000224 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 25: Largest change in params was 0.000182 in the m_probability of surname, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 25 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker_simple.estimate_probability_two_random_records_match(\n", + " deterministic_rules, recall=0.8\n", + ")\n", + "linker_simple.estimate_u_using_random_sampling(max_pairs=1e7)\n", + "session_ssid = linker_simple.estimate_parameters_using_expectation_maximisation(\n", + " block_on(\"given_name\"), estimate_without_term_frequencies=True\n", + ")\n", + "session_pc = linker_simple.estimate_parameters_using_expectation_maximisation(\n", + " block_on(\"street_number\"), estimate_without_term_frequencies=True\n", + ")\n", + "linker_simple.parameter_estimate_comparisons_chart()" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 11: Largest change in params was 0.00607 in the m_probability of given_name, level `All other comparisons`\n" - ] + "cell_type": "code", + "execution_count": 14, + "id": "3a87cb78-0e97-40a3-b757-6c99bb19d7b1", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:57:04.180496Z", + "iopub.status.busy": "2024-05-15T15:57:04.180247Z", + "iopub.status.idle": "2024-05-15T15:57:04.183145Z", + "shell.execute_reply": "2024-05-15T15:57:04.182523Z" + } + }, + "outputs": [], + "source": [ + "# import json\n", + "# we can have a look at the full settings if we wish, including the values of our estimated parameters:\n", + "# print(json.dumps(linker_detailed._settings_obj.as_dict(), indent=2))\n", + "# we can also get a handy summary of of the model in an easily readable format if we wish:\n", + "# print(linker_detailed._settings_obj.human_readable_description)\n", + "# (we suppress output here for brevity)" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 12: Largest change in params was -0.00457 in the m_probability of given_name, level `Exact match on given_name`\n" - ] + "cell_type": "markdown", + "id": "76f453df-848b-4f06-bbb7-d88ee710ae64", + "metadata": {}, + "source": [ + "We can now visualise some of the details of our models. We can look at the match weights, which tell us the relative importance for/against a match for each of our comparsion levels.\n", + "\n", + "Comparing the two models will show the added benefit we get in the more detailed model --- what in the simple model is classed as 'all other comparisons' is instead broken down further, and we can see that the detail of how this is broken down in fact gives us quite a bit of useful information about the likelihood of a match.\n" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 13: Largest change in params was 0.00344 in the m_probability of given_name, level `All other comparisons`\n" - ] + "cell_type": "code", + "execution_count": 15, + "id": "b17b131c-c83e-4c32-bfad-c12021d2c3b7", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:57:04.186220Z", + "iopub.status.busy": "2024-05-15T15:57:04.185782Z", + "iopub.status.idle": "2024-05-15T15:57:04.541188Z", + "shell.execute_reply": "2024-05-15T15:57:04.540169Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker_simple.match_weights_chart()" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 14: Largest change in params was 0.0026 in the m_probability of given_name, level `All other comparisons`\n" - ] + "cell_type": "code", + "execution_count": 16, + "id": "c095ff2b-405b-427c-849f-1468f6ca98e0", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:57:04.545921Z", + "iopub.status.busy": "2024-05-15T15:57:04.545071Z", + "iopub.status.idle": "2024-05-15T15:57:04.888788Z", + "shell.execute_reply": "2024-05-15T15:57:04.887944Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker_detailed.match_weights_chart()" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 15: Largest change in params was -0.00197 in the m_probability of given_name, level `Exact match on given_name`\n" - ] + "cell_type": "markdown", + "id": "e08287f4-711a-4960-b6e6-3f3d19ca8667", + "metadata": {}, + "source": [ + "As well as the match weights, which give us an idea of the overall effect of each comparison level, we can also look at the individual `u` and `m` parameter estimates, which tells us about the prevalence of coincidences and mistakes (for further details/explanation about this see [this article](https://www.robinlinacre.com/maths_of_fellegi_sunter/)). We might want to revise aspects of our model based on the information we ascertain here.\n", + "\n", + "Note however that some of these values are very small, which is why the match weight chart is often more useful for getting a decent picture of things.\n" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 16: Largest change in params was 0.0015 in the m_probability of given_name, level `All other comparisons`\n" - ] + "cell_type": "code", + "execution_count": 17, + "id": "26e5dbe5-a621-44ab-bdb4-0bcd53b220b6", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:57:04.893722Z", + "iopub.status.busy": "2024-05-15T15:57:04.893207Z", + "iopub.status.idle": "2024-05-15T15:57:05.067224Z", + "shell.execute_reply": "2024-05-15T15:57:05.066686Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.HConcatChart(...)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# linker_simple.m_u_parameters_chart()\n", + "linker_detailed.m_u_parameters_chart()" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 17: Largest change in params was -0.00115 in the m_probability of given_name, level `Exact match on given_name`\n" - ] + "cell_type": "markdown", + "id": "67321657-d2a0-4f7c-b68a-a906a210547e", + "metadata": {}, + "source": [ + "It is also useful to have a look at unlinkable records - these are records which do not contain enough information to be linked at some match probability threshold. We can figure this out be seeing whether records are able to be matched with themselves.\n", + "\n", + "This is of course relative to the information we have put into the model - we see that in our simple model, at a 99% match threshold nearly 10% of records are unlinkable, as we have not included enough information in the model for distinct records to be adequately distinguished; this is not an issue in our more detailed model.\n" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 18: Largest change in params was 0.00089 in the m_probability of given_name, level `All other comparisons`\n" - ] + "cell_type": "code", + "execution_count": 18, + "id": "149962d6-a2ad-412f-aa05-8697beb12ed0", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:57:05.070283Z", + "iopub.status.busy": "2024-05-15T15:57:05.070040Z", + "iopub.status.idle": "2024-05-15T15:57:06.960773Z", + "shell.execute_reply": "2024-05-15T15:57:06.959848Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker_simple.unlinkables_chart()" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 19: Largest change in params was -0.000693 in the m_probability of given_name, level `Exact match on given_name`\n" - ] + "cell_type": "code", + "execution_count": 19, + "id": "cac493dd-ea43-4550-8fd4-f758ae90ed75", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:57:06.965159Z", + "iopub.status.busy": "2024-05-15T15:57:06.964863Z", + "iopub.status.idle": "2024-05-15T15:57:07.337075Z", + "shell.execute_reply": "2024-05-15T15:57:07.336337Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker_detailed.unlinkables_chart()" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 20: Largest change in params was 0.000542 in the m_probability of given_name, level `All other comparisons`\n" - ] + "cell_type": "markdown", + "id": "66244ba3-7397-466a-889e-c85f90db1e82", + "metadata": {}, + "source": [ + "Our simple model doesn't do _terribly_, but suffers if we want to have a high match probability --- to be 99% (match weight ~7) certain of matches we have ~10% of records that we will be unable to link.\n", + "\n", + "Our detailed model, however, has enough nuance that we can at least self-link records.\n" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 21: Largest change in params was -0.000426 in the m_probability of given_name, level `Exact match on given_name`\n" - ] + "cell_type": "markdown", + "id": "061e1355-557a-457d-92b6-2589b32371da", + "metadata": {}, + "source": [ + "### Predictions\n", + "\n", + "Now that we have had a look into the details of the models, we will focus on only our more detailed model, which should be able to capture more of the genuine links in our data\n" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 22: Largest change in params was -0.000337 in the m_probability of given_name, level `Exact match on given_name`\n" - ] + "cell_type": "code", + "execution_count": 20, + "id": "03348477-c3c1-42e7-a8af-8f678acc9d58", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:57:07.340733Z", + "iopub.status.busy": "2024-05-15T15:57:07.340494Z", + "iopub.status.idle": "2024-05-15T15:57:12.239689Z", + "shell.execute_reply": "2024-05-15T15:57:12.238900Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " -- WARNING --\n", + "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n", + "Comparison: 'date_of_birth':\n", + " m values not fully trained\n", + "Comparison: 'date_of_birth':\n", + " u values not fully trained\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
match_weightmatch_probabilitysource_dataset_lsource_dataset_rrec_id_lrec_id_rgiven_name_lgiven_name_rgamma_given_nametf_given_name_l...gamma_postcodetf_postcode_ltf_postcode_rbf_postcodebf_tf_adj_postcodeaddress_1_laddress_1_rstate_lstate_rmatch_key
0-1.8239180.220244__splink__input_table_0__splink__input_table_1rec-760-orgrec-3951-dup-0lachlanlachlan30.0113...30.00070.0007791.8172091.518489bushby closetemplestoew avenuenswvic0
1-1.6562300.240853__splink__input_table_0__splink__input_table_1rec-4980-orgrec-4980-dup-0isabellactercteko00.0069...30.00040.0004791.8172092.657355sturt avenuesturta venuevicvic2
2-1.0615200.323926__splink__input_table_0__splink__input_table_1rec-585-orgrec-585-dup-0dannystephenson00.0001...20.00160.001211.7910101.000000o'shanassy streeto'shanassy streettastas1
3-0.8815660.351819__splink__input_table_0__splink__input_table_1rec-1250-orgrec-1250-dup-0lukegazzola00.0055...20.00150.000211.7910101.000000newman morris circuitnewman morr is circuitnswnsw1
4-0.1801550.468822__splink__input_table_0__splink__input_table_1rec-4763-orgrec-4763-dup-0maxalisha00.0021...10.00040.00160.0449831.000000duffy streetduffy s treetnswnsw2
\n", + "

5 rows × 47 columns

\n", + "
" + ], + "text/plain": [ + " match_weight match_probability source_dataset_l \\\n", + "0 -1.823918 0.220244 __splink__input_table_0 \n", + "1 -1.656230 0.240853 __splink__input_table_0 \n", + "2 -1.061520 0.323926 __splink__input_table_0 \n", + "3 -0.881566 0.351819 __splink__input_table_0 \n", + "4 -0.180155 0.468822 __splink__input_table_0 \n", + "\n", + " source_dataset_r rec_id_l rec_id_r given_name_l \\\n", + "0 __splink__input_table_1 rec-760-org rec-3951-dup-0 lachlan \n", + "1 __splink__input_table_1 rec-4980-org rec-4980-dup-0 isabella \n", + "2 __splink__input_table_1 rec-585-org rec-585-dup-0 danny \n", + "3 __splink__input_table_1 rec-1250-org rec-1250-dup-0 luke \n", + "4 __splink__input_table_1 rec-4763-org rec-4763-dup-0 max \n", + "\n", + " given_name_r gamma_given_name tf_given_name_l ... gamma_postcode \\\n", + "0 lachlan 3 0.0113 ... 3 \n", + "1 ctercteko 0 0.0069 ... 3 \n", + "2 stephenson 0 0.0001 ... 2 \n", + "3 gazzola 0 0.0055 ... 2 \n", + "4 alisha 0 0.0021 ... 1 \n", + "\n", + " tf_postcode_l tf_postcode_r bf_postcode bf_tf_adj_postcode \\\n", + "0 0.0007 0.0007 791.817209 1.518489 \n", + "1 0.0004 0.0004 791.817209 2.657355 \n", + "2 0.0016 0.0012 11.791010 1.000000 \n", + "3 0.0015 0.0002 11.791010 1.000000 \n", + "4 0.0004 0.0016 0.044983 1.000000 \n", + "\n", + " address_1_l address_1_r state_l state_r \\\n", + "0 bushby close templestoew avenue nsw vic \n", + "1 sturt avenue sturta venue vic vic \n", + "2 o'shanassy street o'shanassy street tas tas \n", + "3 newman morris circuit newman morr is circuit nsw nsw \n", + "4 duffy street duffy s treet nsw nsw \n", + "\n", + " match_key \n", + "0 0 \n", + "1 2 \n", + "2 1 \n", + "3 1 \n", + "4 2 \n", + "\n", + "[5 rows x 47 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predictions = linker_detailed.predict(threshold_match_probability=0.2)\n", + "df_predictions = predictions.as_pandas_dataframe()\n", + "df_predictions.head(5)" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 23: Largest change in params was 0.000274 in the m_probability of surname, level `All other comparisons`\n" - ] + "cell_type": "markdown", + "id": "fd32d127-5012-42e8-9f69-89237992a793", + "metadata": {}, + "source": [ + "We can see how our model performs at different probability thresholds, with a couple of options depending on the space we wish to view things\n" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 24: Largest change in params was -0.000224 in the m_probability of surname, level `Exact match on surname`\n" - ] + "cell_type": "code", + "execution_count": 21, + "id": "ce8d409c-7ef5-4485-9ec0-8b539fdecb1f", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:57:12.244377Z", + "iopub.status.busy": "2024-05-15T15:57:12.243938Z", + "iopub.status.idle": "2024-05-15T15:57:15.174716Z", + "shell.execute_reply": "2024-05-15T15:57:15.173769Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " -- WARNING --\n", + "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. 
To produce predictions the following untrained parameters will use default values.\n",
     "Comparison: 'date_of_birth':\n",
     "    m values not fully trained\n",
     "Comparison: 'date_of_birth':\n",
     "    u values not fully trained\n"
    ]
   },
   {
    "data": {
     "text/html": [
      "\n",
      "\n",
      "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# linker_detailed.roc_chart_from_labels_column(\"cluster\")\n", + "linker_detailed.precision_recall_chart_from_labels_column(\n", + " \"cluster\", match_weight_round_to_nearest=0.1\n", + ")" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 25: Largest change in params was 0.000182 in the m_probability of surname, level `All other comparisons`\n" - ] + "cell_type": "markdown", + "id": "568b990e-982a-4adc-9629-06ba30f872b0", + "metadata": {}, + "source": [ + "and we can easily see how many individuals we identify and link by looking at clusters generated at some threshold match probability of interest - in this example 99%\n" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "EM converged after 25 iterations\n" - ] + "cell_type": "code", + "execution_count": 22, + "id": "ade53248-212f-4776-8d7d-4632b1749425", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:57:15.183049Z", + "iopub.status.busy": "2024-05-15T15:57:15.182695Z", + "iopub.status.idle": "2024-05-15T15:57:15.493444Z", + "shell.execute_reply": "2024-05-15T15:57:15.492713Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Completed iteration 1, root rows count 0\n" + ] + }, + { + "data": { + "text/plain": [ + "2 4958\n", + "1 84\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clusters = linker_detailed.cluster_pairwise_predictions_at_threshold(\n", + " predictions, threshold_match_probability=0.99\n", + ")\n", + "df_clusters = clusters.as_pandas_dataframe().sort_values(\"cluster_id\")\n", + "df_clusters.groupby(\"cluster_id\").size().value_counts()" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n" - ] + "cell_type": "markdown", + "id": "6f7dbde5-c588-4930-bace-21642f250395", + "metadata": {}, + "source": [ + "In this case, we happen to know what the true links are, so we can manually inspect the ones that are doing worst to see what our model is not capturing - i.e. where we have false negatives.\n", + "\n", + "Similarly, we can look at the non-links which are performing the best, to see whether we have an issue with false positives.\n", + "\n", + "Ordinarily we would not have this luxury, and so would need to dig a bit deeper for clues as to how to improve our model, such as manually inspecting records across threshold probabilities,\n" + ] }, { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" + "cell_type": "code", + "execution_count": 23, + "id": "ef77a8b1-1119-4cb0-b299-343a4022d65e", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:57:15.500107Z", + "iopub.status.busy": "2024-05-15T15:57:15.499499Z", + "iopub.status.idle": "2024-05-15T15:57:15.523366Z", + "shell.execute_reply": "2024-05-15T15:57:15.522625Z" + } + }, + "outputs": [], + "source": [ + "df_predictions[\"cluster_l\"] = df_predictions[\"rec_id_l\"].apply(\n", + " lambda x: \"-\".join(x.split(\"-\")[:2])\n", + ")\n", + "df_predictions[\"cluster_r\"] = df_predictions[\"rec_id_r\"].apply(\n", + " lambda x: \"-\".join(x.split(\"-\")[:2])\n", + ")\n", + "df_true_links = df_predictions[\n", + " df_predictions[\"cluster_l\"] == df_predictions[\"cluster_r\"]\n", + "].sort_values(\"match_probability\")" ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "linker_simple.estimate_probability_two_random_records_match(\n", - " deterministic_rules, recall=0.8\n", - ")\n", - "linker_simple.estimate_u_using_random_sampling(max_pairs=1e7)\n", - "session_ssid = linker_simple.estimate_parameters_using_expectation_maximisation(\n", - " block_on(\"given_name\"), estimate_without_term_frequencies=True\n", - ")\n", - "session_pc = linker_simple.estimate_parameters_using_expectation_maximisation(\n", - " block_on(\"street_number\"), estimate_without_term_frequencies=True\n", - ")\n", - "linker_simple.parameter_estimate_comparisons_chart()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "3a87cb78-0e97-40a3-b757-6c99bb19d7b1", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:57:04.180496Z", - "iopub.status.busy": "2024-05-15T15:57:04.180247Z", - "iopub.status.idle": "2024-05-15T15:57:04.183145Z", - "shell.execute_reply": "2024-05-15T15:57:04.182523Z" - } - }, - "outputs": [], - "source": [ - "# import json\n", - "# we can have a look at the full settings if we wish, including the values of our estimated parameters:\n", - "# print(json.dumps(linker_detailed._settings_obj.as_dict(), indent=2))\n", - "# we can also get a handy summary of of the model in an easily readable format if we wish:\n", - "# print(linker_detailed._settings_obj.human_readable_description)\n", - "# (we suppress output here for brevity)" - ] - }, - { - "cell_type": "markdown", - "id": "76f453df-848b-4f06-bbb7-d88ee710ae64", - "metadata": {}, - "source": [ - "We can now visualise some of the details of our models. We can look at the match weights, which tell us the relative importance for/against a match for each of our comparsion levels.\n", - "\n", - "Comparing the two models will show the added benefit we get in the more detailed model --- what in the simple model is classed as 'all other comparisons' is instead broken down further, and we can see that the detail of how this is broken down in fact gives us quite a bit of useful information about the likelihood of a match.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "b17b131c-c83e-4c32-bfad-c12021d2c3b7", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:57:04.186220Z", - "iopub.status.busy": "2024-05-15T15:57:04.185782Z", - "iopub.status.idle": "2024-05-15T15:57:04.541188Z", - "shell.execute_reply": "2024-05-15T15:57:04.540169Z" - } - }, - "outputs": [ + }, { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" + "cell_type": "code", + "execution_count": 24, + "id": "bc531ca3-fe0d-480d-b059-a7125474fb22", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:57:15.527453Z", + "iopub.status.busy": "2024-05-15T15:57:15.527121Z", + "iopub.status.idle": "2024-05-15T15:57:16.507088Z", + "shell.execute_reply": "2024-05-15T15:57:16.506251Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - "alt.VConcatChart(...)" + "source": [ + "records_to_view = 3\n", + "linker_detailed.waterfall_chart(\n", + " df_true_links.head(records_to_view).to_dict(orient=\"records\")\n", + ")" ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "linker_simple.match_weights_chart()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "c095ff2b-405b-427c-849f-1468f6ca98e0", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:57:04.545921Z", - "iopub.status.busy": "2024-05-15T15:57:04.545071Z", - "iopub.status.idle": "2024-05-15T15:57:04.888788Z", - "shell.execute_reply": "2024-05-15T15:57:04.887944Z" - } - }, - "outputs": [ + }, { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" + "cell_type": "code", + "execution_count": 25, + "id": "aacd9042-5672-4bc4-aa98-940d1f5fd28a", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:57:16.510992Z", + "iopub.status.busy": "2024-05-15T15:57:16.510681Z", + "iopub.status.idle": "2024-05-15T15:57:17.322254Z", + "shell.execute_reply": "2024-05-15T15:57:17.321456Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - "alt.VConcatChart(...)" + "source": [ + "df_non_links = df_predictions[\n", + " df_predictions[\"cluster_l\"] != df_predictions[\"cluster_r\"]\n", + "].sort_values(\"match_probability\", ascending=False)\n", + "linker_detailed.waterfall_chart(\n", + " df_non_links.head(records_to_view).to_dict(orient=\"records\")\n", + ")" ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "linker_detailed.match_weights_chart()" - ] - }, - { - "cell_type": "markdown", - "id": "e08287f4-711a-4960-b6e6-3f3d19ca8667", - "metadata": {}, - "source": [ - "As well as the match weights, which give us an idea of the overall effect of each comparison level, we can also look at the individual `u` and `m` parameter estimates, which tells us about the prevalence of coincidences and mistakes (for further details/explanation about this see [this article](https://www.robinlinacre.com/maths_of_fellegi_sunter/)). We might want to revise aspects of our model based on the information we ascertain here.\n", - "\n", - "Note however that some of these values are very small, which is why the match weight chart is often more useful for getting a decent picture of things.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "26e5dbe5-a621-44ab-bdb4-0bcd53b220b6", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:57:04.893722Z", - "iopub.status.busy": "2024-05-15T15:57:04.893207Z", - "iopub.status.idle": "2024-05-15T15:57:05.067224Z", - "shell.execute_reply": "2024-05-15T15:57:05.066686Z" - } - }, - "outputs": [ + }, { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.HConcatChart(...)" + "cell_type": "markdown", + "id": "99abfc68-61a0-4290-be22-7243680b5ee1", + "metadata": {}, + "source": [ + "## Further refinements\n", + "\n", + "Looking at the non-links we have done well in having no false positives at any substantial match probability --- however looking at some of the true links we can see that there are a few that we are not capturing with sufficient match probability.\n", + "\n", + "We can see that there are a few features that we are not capturing/weighting appropriately\n", + "\n", + "- single-character transpostions, particularly in postcode (which is being lumped in with more 'severe typos'/probable non-matches)\n", + "- given/sur-names being swapped with typos\n", + "- given/sur-names being cross-matches on one only, with no match on the other cross\n", + "\n", + "We will quickly see if we can incorporate these features into a new model. As we are now going into more detail with the inter-relationship between given name and surname, it is probably no longer sensible to model them as independent comparisons, and so we will need to switch to a combined comparison on full name.\n" ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# linker_simple.m_u_parameters_chart()\n", - "linker_detailed.m_u_parameters_chart()" - ] - }, - { - "cell_type": "markdown", - "id": "67321657-d2a0-4f7c-b68a-a906a210547e", - "metadata": {}, - "source": [ - "It is also useful to have a look at unlinkable records - these are records which do not contain enough information to be linked at some match probability threshold. We can figure this out be seeing whether records are able to be matched with themselves.\n", - "\n", - "This is of course relative to the information we have put into the model - we see that in our simple model, at a 99% match threshold nearly 10% of records are unlinkable, as we have not included enough information in the model for distinct records to be adequately distinguished; this is not an issue in our more detailed model.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "149962d6-a2ad-412f-aa05-8697beb12ed0", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:57:05.070283Z", - "iopub.status.busy": "2024-05-15T15:57:05.070040Z", - "iopub.status.idle": "2024-05-15T15:57:06.960773Z", - "shell.execute_reply": "2024-05-15T15:57:06.959848Z" - } - }, - "outputs": [ + }, { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.LayerChart(...)" + "cell_type": "code", + "execution_count": 26, + "id": "2a7229da-9f79-4151-a6b1-018d17205f5f", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:57:17.327035Z", + "iopub.status.busy": "2024-05-15T15:57:17.326665Z", + "iopub.status.idle": "2024-05-15T15:57:17.342204Z", + "shell.execute_reply": "2024-05-15T15:57:17.341227Z" + } + }, + "outputs": [], + "source": [ + "# we need to append a full name column to our source data frames\n", + "# so that we can use it for term frequency adjustments\n", + "dfs[0][\"full_name\"] = dfs[0][\"given_name\"] + \"_\" + dfs[0][\"surname\"]\n", + "dfs[1][\"full_name\"] = dfs[1][\"given_name\"] + \"_\" + dfs[1][\"surname\"]\n", + "\n", + "\n", + "extended_model_settings = {\n", + " \"unique_id_column_name\": \"rec_id\",\n", + " \"link_type\": \"link_only\",\n", + " \"blocking_rules_to_generate_predictions\": blocking_rules,\n", + " \"comparisons\": [\n", + " {\n", + " \"output_column_name\": \"Full name\",\n", + " \"comparison_levels\": [\n", + " {\n", + " \"sql_condition\": \"(given_name_l IS NULL OR given_name_r IS NULL) and (surname_l IS NULL OR surname_r IS NULL)\",\n", + " \"label_for_charts\": \"Null\",\n", + " \"is_null_level\": True,\n", + " },\n", + " # full name match\n", + " cll.ExactMatchLevel(\"full_name\", term_frequency_adjustments=True),\n", + " # typos - keep levels across full name rather than scoring separately\n", + " cll.JaroWinklerLevel(\"full_name\", 0.9),\n", + " cll.JaroWinklerLevel(\"full_name\", 0.7),\n", + " # name switched\n", + " cll.ColumnsReversedLevel(\"given_name\", \"surname\"),\n", + " # name switched + typo\n", + " {\n", + " \"sql_condition\": \"jaro_winkler_similarity(given_name_l, surname_r) + jaro_winkler_similarity(surname_l, given_name_r) >= 1.8\",\n", + " \"label_for_charts\": \"switched + jaro_winkler_similarity >= 1.8\",\n", + " },\n", + " {\n", + " \"sql_condition\": \"jaro_winkler_similarity(given_name_l, surname_r) + jaro_winkler_similarity(surname_l, given_name_r) >= 1.4\",\n", + " \"label_for_charts\": \"switched + jaro_winkler_similarity >= 1.4\",\n", + " },\n", + " # single name match\n", + " cll.ExactMatchLevel(\"given_name\", term_frequency_adjustments=True),\n", + " cll.ExactMatchLevel(\"surname\", term_frequency_adjustments=True),\n", + " # single name cross-match\n", + " {\n", + " \"sql_condition\": \"given_name_l = surname_r OR surname_l = given_name_r\",\n", + " \"label_for_charts\": \"single name cross-matches\",\n", + " }, # single name typos\n", + " cll.JaroWinklerLevel(\"given_name\", 0.9),\n", + " cll.JaroWinklerLevel(\"surname\", 0.9),\n", + " # the rest\n", + " cll.ElseLevel(),\n", + " ],\n", + " },\n", + " ctl.DateComparison(\n", + " \"date_of_birth\",\n", + " input_is_string=True,\n", + " datetime_format=\"%Y%m%d\",\n", + " invalid_dates_as_null=True,\n", + " datetime_metrics=[\"month\", \"year\", \"year\"],\n", + " datetime_thresholds=[1, 1, 10],\n", + " ),\n", + " {\n", + " \"output_column_name\": \"Social security ID\",\n", + " \"comparison_levels\": [\n", + " cll.NullLevel(\"soc_sec_id\"),\n", + " cll.ExactMatchLevel(\"soc_sec_id\", term_frequency_adjustments=True),\n", + " cll.DamerauLevenshteinLevel(\"soc_sec_id\", 1),\n", + " cll.DamerauLevenshteinLevel(\"soc_sec_id\", 2),\n", + " cll.ElseLevel(),\n", + " ],\n", + " },\n", + " {\n", + " \"output_column_name\": \"Street number\",\n", + " \"comparison_levels\": [\n", + " cll.NullLevel(\"street_number\"),\n", + " cll.ExactMatchLevel(\"street_number\", 
term_frequency_adjustments=True),\n", + " cll.DamerauLevenshteinLevel(\"street_number\", 1),\n", + " cll.ElseLevel(),\n", + " ],\n", + " },\n", + " {\n", + " \"output_column_name\": \"Postcode\",\n", + " \"comparison_levels\": [\n", + " cll.NullLevel(\"postcode\"),\n", + " cll.ExactMatchLevel(\"postcode\", term_frequency_adjustments=True),\n", + " cll.DamerauLevenshteinLevel(\"postcode\", 1),\n", + " cll.DamerauLevenshteinLevel(\"postcode\", 2),\n", + " cll.ElseLevel(),\n", + " ],\n", + " },\n", + " # we don't consider further location columns as they will be strongly correlated with postcode\n", + " ],\n", + " \"retain_intermediate_calculation_columns\": True,\n", + "}" ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "linker_simple.unlinkables_chart()" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "cac493dd-ea43-4550-8fd4-f758ae90ed75", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:57:06.965159Z", - "iopub.status.busy": "2024-05-15T15:57:06.964863Z", - "iopub.status.idle": "2024-05-15T15:57:07.337075Z", - "shell.execute_reply": "2024-05-15T15:57:07.336337Z" - } - }, - "outputs": [ + }, { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" + "cell_type": "code", + "execution_count": 27, + "id": "1581eeeb-246b-46de-be88-ba4dc821fce7", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:57:17.346493Z", + "iopub.status.busy": "2024-05-15T15:57:17.346091Z", + "iopub.status.idle": "2024-05-15T15:58:52.238122Z", + "shell.execute_reply": "2024-05-15T15:58:52.237374Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Probability two random records match is estimated to be 0.000239.\n", + "This means that amongst all possible pairwise record comparisons, one in 4,185.85 are expected to match. With 25,000,000 total possible comparisons, we expect a total of around 5,972.50 matching pairs\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "----- Estimating u probabilities using random sampling -----\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 month' (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 year' (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 10 year' (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Estimated u probabilities using random sampling\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. 
Missing estimates for:\n", + " - Full name (no m values are trained).\n", + " - date_of_birth (some u values are not trained, no m values are trained).\n", + " - Social security ID (no m values are trained).\n", + " - Street number (no m values are trained).\n", + " - Postcode (no m values are trained).\n" + ] + } ], - "text/plain": [ - "alt.LayerChart(...)" + "source": [ + "# train\n", + "linker_advanced = Linker(dfs, extended_model_settings, database_api=DuckDBAPI())\n", + "linker_advanced.estimate_probability_two_random_records_match(\n", + " deterministic_rules, recall=0.8\n", + ")\n", + "# We recommend increasing target rows to 1e8 improve accuracy for u\n", + "# values in full name comparison, as we have subdivided the data more finely\n", + "\n", + "# Here, 1e7 for speed\n", + "linker_advanced.estimate_u_using_random_sampling(max_pairs=1e7)" ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "linker_detailed.unlinkables_chart()" - ] - }, - { - "cell_type": "markdown", - "id": "66244ba3-7397-466a-889e-c85f90db1e82", - "metadata": {}, - "source": [ - "Our simple model doesn't do _terribly_, but suffers if we want to have a high match probability --- to be 99% (match weight ~7) certain of matches we have ~10% of records that we will be unable to link.\n", - "\n", - "Our detailed model, however, has enough nuance that we can at least self-link records.\n" - ] - }, - { - "cell_type": "markdown", - "id": "061e1355-557a-457d-92b6-2589b32371da", - "metadata": {}, - "source": [ - "### Predictions\n", - "\n", - "Now that we have had a look into the details of the models, we will focus on only our more detailed model, which should be able to capture more of the genuine links in our data\n" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "03348477-c3c1-42e7-a8af-8f678acc9d58", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:57:07.340733Z", - "iopub.status.busy": "2024-05-15T15:57:07.340494Z", - "iopub.status.idle": "2024-05-15T15:57:12.239689Z", - "shell.execute_reply": "2024-05-15T15:57:12.238900Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - " -- WARNING --\n", - "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n", - "Comparison: 'date_of_birth':\n", - " m values not fully trained\n", - "Comparison: 'date_of_birth':\n", - " u values not fully trained\n" - ] }, { - "data": { - "text/html": [ - "
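These warnings are expected midway through training: an EM session cannot estimate m probabilities for a comparison that its own blocking rule holds fixed, which is why the model is trained with two complementary sessions (blocking on date of birth, then on postcode). To keep track of what is still untrained between sessions, the introspection shown earlier in the notebook can be reused; a one-line sketch using the same internal `_settings_obj` handle:

```python
# which comparisons still lack m / u estimates after this session?
print(linker_advanced._settings_obj.human_readable_description)
```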
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
match_weightmatch_probabilitysource_dataset_lsource_dataset_rrec_id_lrec_id_rgiven_name_lgiven_name_rgamma_given_nametf_given_name_l...gamma_postcodetf_postcode_ltf_postcode_rbf_postcodebf_tf_adj_postcodeaddress_1_laddress_1_rstate_lstate_rmatch_key
0-1.8239180.220244__splink__input_table_0__splink__input_table_1rec-760-orgrec-3951-dup-0lachlanlachlan30.0113...30.00070.0007791.8172091.518489bushby closetemplestoew avenuenswvic0
1-1.6562300.240853__splink__input_table_0__splink__input_table_1rec-4980-orgrec-4980-dup-0isabellactercteko00.0069...30.00040.0004791.8172092.657355sturt avenuesturta venuevicvic2
2-1.0615200.323926__splink__input_table_0__splink__input_table_1rec-585-orgrec-585-dup-0dannystephenson00.0001...20.00160.001211.7910101.000000o'shanassy streeto'shanassy streettastas1
3-0.8815660.351819__splink__input_table_0__splink__input_table_1rec-1250-orgrec-1250-dup-0lukegazzola00.0055...20.00150.000211.7910101.000000newman morris circuitnewman morr is circuitnswnsw1
4-0.1801550.468822__splink__input_table_0__splink__input_table_1rec-4763-orgrec-4763-dup-0maxalisha00.0021...10.00040.00160.0449831.000000duffy streetduffy s treetnswnsw2
\n", - "

5 rows × 47 columns

\n", - "
" + "cell_type": "code", + "execution_count": 28, + "id": "265f0651", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:58:52.244579Z", + "iopub.status.busy": "2024-05-15T15:58:52.244307Z", + "iopub.status.idle": "2024-05-15T15:58:53.189566Z", + "shell.execute_reply": "2024-05-15T15:58:53.188815Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.date_of_birth = r.date_of_birth\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - Full name\n", + " - Social security ID\n", + " - Street number\n", + " - Postcode\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - date_of_birth\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:\n", + "Level single name cross-matches on comparison Full name not observed in dataset, unable to train m value\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was -0.465 in the m_probability of Full name, level `Exact match on full_name`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was 0.00251 in the m_probability of Social security ID, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was 4.91e-05 in the m_probability of Social security ID, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 3 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "m probability not trained for Full name - single name cross-matches (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. Missing estimates for:\n", + " - Full name (some m values are not trained).\n", + " - date_of_birth (some u values are not trained, no m values are trained).\n" + ] + } ], - "text/plain": [ - " match_weight match_probability source_dataset_l \\\n", - "0 -1.823918 0.220244 __splink__input_table_0 \n", - "1 -1.656230 0.240853 __splink__input_table_0 \n", - "2 -1.061520 0.323926 __splink__input_table_0 \n", - "3 -0.881566 0.351819 __splink__input_table_0 \n", - "4 -0.180155 0.468822 __splink__input_table_0 \n", - "\n", - " source_dataset_r rec_id_l rec_id_r given_name_l \\\n", - "0 __splink__input_table_1 rec-760-org rec-3951-dup-0 lachlan \n", - "1 __splink__input_table_1 rec-4980-org rec-4980-dup-0 isabella \n", - "2 __splink__input_table_1 rec-585-org rec-585-dup-0 danny \n", - "3 __splink__input_table_1 rec-1250-org rec-1250-dup-0 luke \n", - "4 __splink__input_table_1 rec-4763-org rec-4763-dup-0 max \n", - "\n", - " given_name_r gamma_given_name tf_given_name_l ... gamma_postcode \\\n", - "0 lachlan 3 0.0113 ... 3 \n", - "1 ctercteko 0 0.0069 ... 3 \n", - "2 stephenson 0 0.0001 ... 2 \n", - "3 gazzola 0 0.0055 ... 2 \n", - "4 alisha 0 0.0021 ... 
1 \n", - "\n", - " tf_postcode_l tf_postcode_r bf_postcode bf_tf_adj_postcode \\\n", - "0 0.0007 0.0007 791.817209 1.518489 \n", - "1 0.0004 0.0004 791.817209 2.657355 \n", - "2 0.0016 0.0012 11.791010 1.000000 \n", - "3 0.0015 0.0002 11.791010 1.000000 \n", - "4 0.0004 0.0016 0.044983 1.000000 \n", - "\n", - " address_1_l address_1_r state_l state_r \\\n", - "0 bushby close templestoew avenue nsw vic \n", - "1 sturt avenue sturta venue vic vic \n", - "2 o'shanassy street o'shanassy street tas tas \n", - "3 newman morris circuit newman morr is circuit nsw nsw \n", - "4 duffy street duffy s treet nsw nsw \n", - "\n", - " match_key \n", - "0 0 \n", - "1 2 \n", - "2 1 \n", - "3 1 \n", - "4 2 \n", - "\n", - "[5 rows x 47 columns]" + "source": [ + "session_dob = linker_advanced.estimate_parameters_using_expectation_maximisation(\n", + " \"l.date_of_birth = r.date_of_birth\", estimate_without_term_frequencies=True\n", + ")" ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "predictions = linker_detailed.predict(threshold_match_probability=0.2)\n", - "df_predictions = predictions.as_pandas_dataframe()\n", - "df_predictions.head(5)" - ] - }, - { - "cell_type": "markdown", - "id": "fd32d127-5012-42e8-9f69-89237992a793", - "metadata": {}, - "source": [ - "We can see how our model performs at different probability thresholds, with a couple of options depending on the space we wish to view things\n" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "ce8d409c-7ef5-4485-9ec0-8b539fdecb1f", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:57:12.244377Z", - "iopub.status.busy": "2024-05-15T15:57:12.243938Z", - "iopub.status.idle": "2024-05-15T15:57:15.174716Z", - "shell.execute_reply": "2024-05-15T15:57:15.173769Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - " -- WARNING --\n", - "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n", - "Comparison: 'date_of_birth':\n", - " m values not fully trained\n", - "Comparison: 'date_of_birth':\n", - " u values not fully trained\n" - ] }, { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" + "cell_type": "code", + "execution_count": 29, + "id": "ebcb15c8", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:58:53.193304Z", + "iopub.status.busy": "2024-05-15T15:58:53.193012Z", + "iopub.status.idle": "2024-05-15T15:58:54.287492Z", + "shell.execute_reply": "2024-05-15T15:58:54.286732Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.postcode = r.postcode\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - Full name\n", + " - date_of_birth\n", + " - Social security ID\n", + " - Street number\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - Postcode\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:\n", + "Level single name cross-matches on comparison Full name not observed in dataset, unable to train m value\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:\n", + "Level Abs difference of 'transformed date_of_birth <= 1 month' on comparison date_of_birth not observed in dataset, unable to train m value\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:\n", + "Level Abs difference of 'transformed date_of_birth <= 1 year' on comparison date_of_birth not observed in dataset, unable to train m value\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:\n", + "Level Abs difference of 'transformed date_of_birth <= 10 year' on comparison date_of_birth not observed in dataset, unable to train m value\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was 0.0375 in the m_probability of date_of_birth, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was 0.000635 in the m_probability of date_of_birth, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was 1.68e-05 in the m_probability of Social security ID, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 3 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "m probability not trained for Full name - single name cross-matches (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 month' (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 year' (comparison vector value: 2). 
This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 10 year' (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. Missing estimates for:\n", + " - Full name (some m values are not trained).\n", + " - date_of_birth (some u values are not trained, some m values are not trained).\n" + ] + } ], - "text/plain": [ - "alt.Chart(...)" + "source": [ + "session_pc = linker_advanced.estimate_parameters_using_expectation_maximisation(\n", + " \"l.postcode = r.postcode\", estimate_without_term_frequencies=True\n", + ")" ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# linker_detailed.roc_chart_from_labels_column(\"cluster\")\n", - "linker_detailed.precision_recall_chart_from_labels_column(\n", - " \"cluster\", match_weight_round_to_nearest=0.1\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "568b990e-982a-4adc-9629-06ba30f872b0", - "metadata": {}, - "source": [ - "and we can easily see how many individuals we identify and link by looking at clusters generated at some threshold match probability of interest - in this example 99%\n" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "ade53248-212f-4776-8d7d-4632b1749425", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:57:15.183049Z", - "iopub.status.busy": "2024-05-15T15:57:15.182695Z", - "iopub.status.idle": "2024-05-15T15:57:15.493444Z", - "shell.execute_reply": "2024-05-15T15:57:15.492713Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Completed iteration 1, root rows count 0\n" - ] }, { - "data": { - "text/plain": [ - "2 4958\n", - "1 84\n", - "Name: count, dtype: int64" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "clusters = linker_detailed.cluster_pairwise_predictions_at_threshold(\n", - " predictions, threshold_match_probability=0.99\n", - ")\n", - "df_clusters = clusters.as_pandas_dataframe().sort_values(\"cluster_id\")\n", - "df_clusters.groupby(\"cluster_id\").size().value_counts()" - ] - }, - { - "cell_type": "markdown", - "id": "6f7dbde5-c588-4930-bace-21642f250395", - "metadata": {}, - "source": [ - "In this case, we happen to know what the true links are, so we can manually inspect the ones that are doing worst to see what our model is not capturing - i.e. 
where we have false negatives.\n", - "\n", - "Similarly, we can look at the non-links which are performing the best, to see whether we have an issue with false positives.\n", - "\n", - "Ordinarily we would not have this luxury, and so would need to dig a bit deeper for clues as to how to improve our model, such as manually inspecting records across threshold probabilities,\n" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "ef77a8b1-1119-4cb0-b299-343a4022d65e", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:57:15.500107Z", - "iopub.status.busy": "2024-05-15T15:57:15.499499Z", - "iopub.status.idle": "2024-05-15T15:57:15.523366Z", - "shell.execute_reply": "2024-05-15T15:57:15.522625Z" - } - }, - "outputs": [], - "source": [ - "df_predictions[\"cluster_l\"] = df_predictions[\"rec_id_l\"].apply(\n", - " lambda x: \"-\".join(x.split(\"-\")[:2])\n", - ")\n", - "df_predictions[\"cluster_r\"] = df_predictions[\"rec_id_r\"].apply(\n", - " lambda x: \"-\".join(x.split(\"-\")[:2])\n", - ")\n", - "df_true_links = df_predictions[\n", - " df_predictions[\"cluster_l\"] == df_predictions[\"cluster_r\"]\n", - "].sort_values(\"match_probability\")" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "bc531ca3-fe0d-480d-b059-a7125474fb22", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:57:15.527453Z", - "iopub.status.busy": "2024-05-15T15:57:15.527121Z", - "iopub.status.idle": "2024-05-15T15:57:16.507088Z", - "shell.execute_reply": "2024-05-15T15:57:16.506251Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" + "cell_type": "code", + "execution_count": 30, + "id": "d9d21e85-b89b-435a-8b75-142166ac3f31", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:58:54.292571Z", + "iopub.status.busy": "2024-05-15T15:58:54.292308Z", + "iopub.status.idle": "2024-05-15T15:58:54.443712Z", + "shell.execute_reply": "2024-05-15T15:58:54.443023Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - "alt.LayerChart(...)" + "source": [ + "linker_advanced.parameter_estimate_comparisons_chart()" ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "records_to_view = 3\n", - "linker_detailed.waterfall_chart(\n", - " df_true_links.head(records_to_view).to_dict(orient=\"records\")\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "aacd9042-5672-4bc4-aa98-940d1f5fd28a", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:57:16.510992Z", - "iopub.status.busy": "2024-05-15T15:57:16.510681Z", - "iopub.status.idle": "2024-05-15T15:57:17.322254Z", - "shell.execute_reply": "2024-05-15T15:57:17.321456Z" - } - }, - "outputs": [ + }, { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" + "cell_type": "code", + "execution_count": 31, + "id": "4a857c18-b0d5-48dc-b7f1-1f6389db5089", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:58:54.447134Z", + "iopub.status.busy": "2024-05-15T15:58:54.446857Z", + "iopub.status.idle": "2024-05-15T15:58:54.770678Z", + "shell.execute_reply": "2024-05-15T15:58:54.770024Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - "alt.LayerChart(...)" + "source": [ + "linker_advanced.match_weights_chart()" ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_non_links = df_predictions[\n", - " df_predictions[\"cluster_l\"] != df_predictions[\"cluster_r\"]\n", - "].sort_values(\"match_probability\", ascending=False)\n", - "linker_detailed.waterfall_chart(\n", - " df_non_links.head(records_to_view).to_dict(orient=\"records\")\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "99abfc68-61a0-4290-be22-7243680b5ee1", - "metadata": {}, - "source": [ - "## Further refinements\n", - "\n", - "Looking at the non-links we have done well in having no false positives at any substantial match probability --- however looking at some of the true links we can see that there are a few that we are not capturing with sufficient match probability.\n", - "\n", - "We can see that there are a few features that we are not capturing/weighting appropriately\n", - "\n", - "- single-character transpostions, particularly in postcode (which is being lumped in with more 'severe typos'/probable non-matches)\n", - "- given/sur-names being swapped with typos\n", - "- given/sur-names being cross-matches on one only, with no match on the other cross\n", - "\n", - "We will quickly see if we can incorporate these features into a new model. As we are now going into more detail with the inter-relationship between given name and surname, it is probably no longer sensible to model them as independent comparisons, and so we will need to switch to a combined comparison on full name.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "2a7229da-9f79-4151-a6b1-018d17205f5f", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:57:17.327035Z", - "iopub.status.busy": "2024-05-15T15:57:17.326665Z", - "iopub.status.idle": "2024-05-15T15:57:17.342204Z", - "shell.execute_reply": "2024-05-15T15:57:17.341227Z" - } - }, - "outputs": [], - "source": [ - "# we need to append a full name column to our source data frames\n", - "# so that we can use it for term frequency adjustments\n", - "dfs[0][\"full_name\"] = dfs[0][\"given_name\"] + \"_\" + dfs[0][\"surname\"]\n", - "dfs[1][\"full_name\"] = dfs[1][\"given_name\"] + \"_\" + dfs[1][\"surname\"]\n", - "\n", - "\n", - "extended_model_settings = {\n", - " \"unique_id_column_name\": \"rec_id\",\n", - " \"link_type\": \"link_only\",\n", - " \"blocking_rules_to_generate_predictions\": blocking_rules,\n", - " \"comparisons\": [\n", - " {\n", - " \"output_column_name\": \"Full name\",\n", - " \"comparison_levels\": [\n", - " {\n", - " \"sql_condition\": \"(given_name_l IS NULL OR given_name_r IS NULL) and (surname_l IS NULL OR surname_r IS NULL)\",\n", - " \"label_for_charts\": \"Null\",\n", - " \"is_null_level\": True,\n", - " },\n", - " # full name match\n", - " cll.ExactMatchLevel(\"full_name\", term_frequency_adjustments=True),\n", - " # typos - keep levels across full name rather than scoring separately\n", - " cll.JaroWinklerLevel(\"full_name\", 0.9),\n", - " cll.JaroWinklerLevel(\"full_name\", 0.7),\n", - " # name switched\n", - " cll.ColumnsReversedLevel(\"given_name\", \"surname\"),\n", - " # name switched + typo\n", - " {\n", - " \"sql_condition\": \"jaro_winkler_similarity(given_name_l, surname_r) + jaro_winkler_similarity(surname_l, 
given_name_r) >= 1.8\",\n", - " \"label_for_charts\": \"switched + jaro_winkler_similarity >= 1.8\",\n", - " },\n", - " {\n", - " \"sql_condition\": \"jaro_winkler_similarity(given_name_l, surname_r) + jaro_winkler_similarity(surname_l, given_name_r) >= 1.4\",\n", - " \"label_for_charts\": \"switched + jaro_winkler_similarity >= 1.4\",\n", - " },\n", - " # single name match\n", - " cll.ExactMatchLevel(\"given_name\", term_frequency_adjustments=True),\n", - " cll.ExactMatchLevel(\"surname\", term_frequency_adjustments=True),\n", - " # single name cross-match\n", - " {\n", - " \"sql_condition\": \"given_name_l = surname_r OR surname_l = given_name_r\",\n", - " \"label_for_charts\": \"single name cross-matches\",\n", - " }, # single name typos\n", - " cll.JaroWinklerLevel(\"given_name\", 0.9),\n", - " cll.JaroWinklerLevel(\"surname\", 0.9),\n", - " # the rest\n", - " cll.ElseLevel(),\n", - " ],\n", - " },\n", - " ctl.DateComparison(\n", - " \"date_of_birth\",\n", - " input_is_string=True,\n", - " datetime_format=\"%Y%m%d\",\n", - " invalid_dates_as_null=True,\n", - " datetime_metrics=[\"month\", \"year\", \"year\"],\n", - " datetime_thresholds=[1, 1, 10],\n", - " ),\n", - " {\n", - " \"output_column_name\": \"Social security ID\",\n", - " \"comparison_levels\": [\n", - " cll.NullLevel(\"soc_sec_id\"),\n", - " cll.ExactMatchLevel(\"soc_sec_id\", term_frequency_adjustments=True),\n", - " cll.DamerauLevenshteinLevel(\"soc_sec_id\", 1),\n", - " cll.DamerauLevenshteinLevel(\"soc_sec_id\", 2),\n", - " cll.ElseLevel(),\n", - " ],\n", - " },\n", - " {\n", - " \"output_column_name\": \"Street number\",\n", - " \"comparison_levels\": [\n", - " cll.NullLevel(\"street_number\"),\n", - " cll.ExactMatchLevel(\"street_number\", term_frequency_adjustments=True),\n", - " cll.DamerauLevenshteinLevel(\"street_number\", 1),\n", - " cll.ElseLevel(),\n", - " ],\n", - " },\n", - " {\n", - " \"output_column_name\": \"Postcode\",\n", - " \"comparison_levels\": [\n", - " cll.NullLevel(\"postcode\"),\n", - " cll.ExactMatchLevel(\"postcode\", term_frequency_adjustments=True),\n", - " cll.DamerauLevenshteinLevel(\"postcode\", 1),\n", - " cll.DamerauLevenshteinLevel(\"postcode\", 2),\n", - " cll.ElseLevel(),\n", - " ],\n", - " },\n", - " # we don't consider further location columns as they will be strongly correlated with postcode\n", - " ],\n", - " \"retain_intermediate_calculation_columns\": True,\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "1581eeeb-246b-46de-be88-ba4dc821fce7", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:57:17.346493Z", - "iopub.status.busy": "2024-05-15T15:57:17.346091Z", - "iopub.status.idle": "2024-05-15T15:58:52.238122Z", - "shell.execute_reply": "2024-05-15T15:58:52.237374Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Probability two random records match is estimated to be 0.000239.\n", - "This means that amongst all possible pairwise record comparisons, one in 4,185.85 are expected to match. With 25,000,000 total possible comparisons, we expect a total of around 5,972.50 matching pairs\n" - ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "----- Estimating u probabilities using random sampling -----\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 month' (comparison vector value: 3). 
This usually means the comparison level was never observed in the training data.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 year' (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "u probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 10 year' (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Estimated u probabilities using random sampling\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Your model is not yet fully trained. Missing estimates for:\n", - " - Full name (no m values are trained).\n", - " - date_of_birth (some u values are not trained, no m values are trained).\n", - " - Social security ID (no m values are trained).\n", - " - Street number (no m values are trained).\n", - " - Postcode (no m values are trained).\n" - ] - } - ], - "source": [ - "# train\n", - "linker_advanced = Linker(dfs, extended_model_settings, database_api=DuckDBAPI())\n", - "linker_advanced.estimate_probability_two_random_records_match(\n", - " deterministic_rules, recall=0.8\n", - ")\n", - "# We recommend increasing target rows to 1e8 improve accuracy for u\n", - "# values in full name comparison, as we have subdivided the data more finely\n", - "\n", - "# Here, 1e7 for speed\n", - "linker_advanced.estimate_u_using_random_sampling(max_pairs=1e7)" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "265f0651", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:58:52.244579Z", - "iopub.status.busy": "2024-05-15T15:58:52.244307Z", - "iopub.status.idle": "2024-05-15T15:58:53.189566Z", - "shell.execute_reply": "2024-05-15T15:58:53.188815Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "----- Starting EM training session -----\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Estimating the m probabilities of the model by blocking on:\n", - "l.date_of_birth = r.date_of_birth\n", - "\n", - "Parameter estimates will be made for the following comparison(s):\n", - " - Full name\n", - " - Social security ID\n", - " - Street number\n", - " - Postcode\n", - "\n", - "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", - " - date_of_birth\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:\n", - "Level single name cross-matches on comparison Full name not observed in dataset, unable to train m value\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 1: Largest change in params was -0.465 in the m_probability of Full name, level `Exact match on full_name`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 2: Largest change in params was 0.00251 in the m_probability of Social security ID, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 3: Largest change in params was 4.91e-05 in the 
m_probability of Social security ID, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "EM converged after 3 iterations\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "m probability not trained for Full name - single name cross-matches (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Your model is not yet fully trained. Missing estimates for:\n", - " - Full name (some m values are not trained).\n", - " - date_of_birth (some u values are not trained, no m values are trained).\n" - ] - } - ], - "source": [ - "session_dob = linker_advanced.estimate_parameters_using_expectation_maximisation(\n", - " \"l.date_of_birth = r.date_of_birth\", estimate_without_term_frequencies=True\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "ebcb15c8", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:58:53.193304Z", - "iopub.status.busy": "2024-05-15T15:58:53.193012Z", - "iopub.status.idle": "2024-05-15T15:58:54.287492Z", - "shell.execute_reply": "2024-05-15T15:58:54.286732Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "----- Starting EM training session -----\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Estimating the m probabilities of the model by blocking on:\n", - "l.postcode = r.postcode\n", - "\n", - "Parameter estimates will be made for the following comparison(s):\n", - " - Full name\n", - " - date_of_birth\n", - " - Social security ID\n", - " - Street number\n", - "\n", - "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", - " - Postcode\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:\n", - "Level single name cross-matches on comparison Full name not observed in dataset, unable to train m value\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:\n", - "Level Abs difference of 'transformed date_of_birth <= 1 month' on comparison date_of_birth not observed in dataset, unable to train m value\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:\n", - "Level Abs difference of 'transformed date_of_birth <= 1 year' on comparison date_of_birth not observed in dataset, unable to train m value\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:\n", - "Level Abs difference of 'transformed date_of_birth <= 10 year' on comparison date_of_birth not observed in dataset, unable to train m value\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 1: Largest change in params was 0.0375 in the m_probability of date_of_birth, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 2: Largest change in params was 0.000635 in the m_probability of date_of_birth, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 3: Largest change in params was 1.68e-05 in the m_probability of Social security ID, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - 
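One point worth making explicit about the session above (a sketch of the reasoning, not a new API): m probabilities cannot be estimated for the comparison used in the EM blocking rule, so this date-of-birth-blocked session leaves `date_of_birth` untrained, and the postcode-blocked session in the next cell supplies it (and vice versa for `Postcode`). `estimate_without_term_frequencies=True` runs EM on the comparison vectors before term-frequency adjustments are applied, which is typically much faster with little effect on the estimates.

```python
# Gathered for clarity: each comparison gets its m values from whichever
# session does *not* block on it. These are the same calls as the two cells.
for training_rule in ["l.date_of_birth = r.date_of_birth", "l.postcode = r.postcode"]:
    linker_advanced.estimate_parameters_using_expectation_maximisation(
        training_rule, estimate_without_term_frequencies=True
    )
```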
"output_type": "stream", - "text": [ - "\n", - "EM converged after 3 iterations\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "m probability not trained for Full name - single name cross-matches (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 month' (comparison vector value: 3). This usually means the comparison level was never observed in the training data.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 1 year' (comparison vector value: 2). This usually means the comparison level was never observed in the training data.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "m probability not trained for date_of_birth - Abs difference of 'transformed date_of_birth <= 10 year' (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Your model is not yet fully trained. Missing estimates for:\n", - " - Full name (some m values are not trained).\n", - " - date_of_birth (some u values are not trained, some m values are not trained).\n" - ] - } - ], - "source": [ - "session_pc = linker_advanced.estimate_parameters_using_expectation_maximisation(\n", - " \"l.postcode = r.postcode\", estimate_without_term_frequencies=True\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "d9d21e85-b89b-435a-8b75-142166ac3f31", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:58:54.292571Z", - "iopub.status.busy": "2024-05-15T15:58:54.292308Z", - "iopub.status.idle": "2024-05-15T15:58:54.443712Z", - "shell.execute_reply": "2024-05-15T15:58:54.443023Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" + "cell_type": "code", + "execution_count": 32, + "id": "e1ee24d9-1def-4b8d-bb85-1c63b595e75e", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T15:58:54.773893Z", + "iopub.status.busy": "2024-05-15T15:58:54.773655Z", + "iopub.status.idle": "2024-05-15T15:58:56.607253Z", + "shell.execute_reply": "2024-05-15T15:58:56.606584Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " -- WARNING --\n", + "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n", + "Comparison: 'Full name':\n", + " m values not fully trained\n", + "Comparison: 'date_of_birth':\n", + " m values not fully trained\n", + "Comparison: 'date_of_birth':\n", + " u values not fully trained\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Completed iteration 1, root rows count 0\n" + ] + }, + { + "data": { + "text/plain": [ + "2 4960\n", + "1 80\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - "alt.Chart(...)" + "source": [ + "predictions_adv = linker_advanced.predict()\n", + "df_predictions_adv = predictions_adv.as_pandas_dataframe()\n", + "clusters_adv = linker_advanced.cluster_pairwise_predictions_at_threshold(\n", + " predictions_adv, threshold_match_probability=0.99\n", + ")\n", + "df_clusters_adv = clusters_adv.as_pandas_dataframe().sort_values(\"cluster_id\")\n", + "df_clusters_adv.groupby(\"cluster_id\").size().value_counts()" ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "linker_advanced.parameter_estimate_comparisons_chart()" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "4a857c18-b0d5-48dc-b7f1-1f6389db5089", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:58:54.447134Z", - "iopub.status.busy": "2024-05-15T15:58:54.446857Z", - "iopub.status.idle": "2024-05-15T15:58:54.770678Z", - "shell.execute_reply": "2024-05-15T15:58:54.770024Z" - } - }, - "outputs": [ + }, { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.VConcatChart(...)" + "cell_type": "markdown", + "id": "8db464e7-c57b-48e1-9e7b-c01ce9ccbad9", + "metadata": {}, + "source": [ + "This is a pretty modest improvement on our previous model - however it is worth re-iterating that we should not necessarily expect to recover _all_ matches, as in several cases it may be unreasonable for a model to have reasonable confidence that two records refer to the same entity.\n", + "\n", + "If we wished to improve matters we could iterate on this process - investigating where our model is not performing as we would hope, and seeing how we can adjust these areas to address these shortcomings.\n" ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "linker_advanced.match_weights_chart()" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "e1ee24d9-1def-4b8d-bb85-1c63b595e75e", - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T15:58:54.773893Z", - "iopub.status.busy": "2024-05-15T15:58:54.773655Z", - "iopub.status.idle": "2024-05-15T15:58:56.607253Z", - "shell.execute_reply": "2024-05-15T15:58:56.606584Z" } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - " -- WARNING --\n", - "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n", - "Comparison: 'Full name':\n", - " m values not fully trained\n", - "Comparison: 'date_of_birth':\n", - " m values not fully trained\n", - "Comparison: 'date_of_birth':\n", - " u values not fully trained\n" - ] + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Completed iteration 1, root rows count 0\n" - ] + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" }, - { - "data": { - "text/plain": [ - "2 4960\n", - "1 80\n", - "Name: count, dtype: int64" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": { + "1cd1512b68bf43868e26a4c0fa908d4e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "bar_color": "black", + "description_width": "" + } + }, + "3540e7572a2e497c8837e9038728b244": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "bar_color": "black", + "description_width": "" + } + }, + "4eab2071171e419a8f9ddbd6a12f12e4": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_d9ed287ce1f146c09c8a0e89a7bd9855", + "max": 100, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_9aae753cd7a54a2d94be6496b1812b3c", + "tabbable": null, + "tooltip": null, + "value": 100 + } + }, + "73555dbfc04c485fb9c6d09bc677f843": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "auto" + } + }, + "886192be2bdf4a88a1d8808f1db44fb2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_73555dbfc04c485fb9c6d09bc677f843", + "max": 100, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_3540e7572a2e497c8837e9038728b244", + "tabbable": null, + "tooltip": null, + "value": 100 + } + }, + "8906edf4488846fb908d17be3dc5440f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": 
null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "auto" + } + }, + "9aae753cd7a54a2d94be6496b1812b3c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "bar_color": "black", + "description_width": "" + } + }, + "cda32ebef57a4bcb9b8d2d531ac2b32a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_8906edf4488846fb908d17be3dc5440f", + "max": 100, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_1cd1512b68bf43868e26a4c0fa908d4e", + "tabbable": null, + "tooltip": null, + "value": 100 + } + }, + "d9ed287ce1f146c09c8a0e89a7bd9855": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "auto" + } + } + }, + "version_major": 2, + "version_minor": 0 + } } - ], - "source": [ - "predictions_adv = linker_advanced.predict()\n", - "df_predictions_adv = predictions_adv.as_pandas_dataframe()\n", - "clusters_adv = linker_advanced.cluster_pairwise_predictions_at_threshold(\n", - " predictions_adv, threshold_match_probability=0.99\n", - ")\n", - "df_clusters_adv = clusters_adv.as_pandas_dataframe().sort_values(\"cluster_id\")\n", - "df_clusters_adv.groupby(\"cluster_id\").size().value_counts()" - ] - }, - { - "cell_type": "markdown", - "id": 
"8db464e7-c57b-48e1-9e7b-c01ce9ccbad9", - "metadata": {}, - "source": [ - "This is a pretty modest improvement on our previous model - however it is worth re-iterating that we should not necessarily expect to recover _all_ matches, as in several cases it may be unreasonable for a model to have reasonable confidence that two records refer to the same entity.\n", - "\n", - "If we wished to improve matters we could iterate on this process - investigating where our model is not performing as we would hope, and seeing how we can adjust these areas to address these shortcomings.\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "state": { - "1cd1512b68bf43868e26a4c0fa908d4e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "bar_color": "black", - "description_width": "" - } - }, - "3540e7572a2e497c8837e9038728b244": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "bar_color": "black", - "description_width": "" - } - }, - "4eab2071171e419a8f9ddbd6a12f12e4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_d9ed287ce1f146c09c8a0e89a7bd9855", - "max": 100, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_9aae753cd7a54a2d94be6496b1812b3c", - "tabbable": null, - "tooltip": null, - "value": 100 - } - }, - "73555dbfc04c485fb9c6d09bc677f843": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, 
- "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "auto" - } - }, - "886192be2bdf4a88a1d8808f1db44fb2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_73555dbfc04c485fb9c6d09bc677f843", - "max": 100, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_3540e7572a2e497c8837e9038728b244", - "tabbable": null, - "tooltip": null, - "value": 100 - } - }, - "8906edf4488846fb908d17be3dc5440f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "auto" - } - }, - "9aae753cd7a54a2d94be6496b1812b3c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "bar_color": "black", - "description_width": "" - } - }, - "cda32ebef57a4bcb9b8d2d531ac2b32a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "", - 
"description_allow_html": false, - "layout": "IPY_MODEL_8906edf4488846fb908d17be3dc5440f", - "max": 100, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_1cd1512b68bf43868e26a4c0fa908d4e", - "tabbable": null, - "tooltip": null, - "value": 100 - } - }, - "d9ed287ce1f146c09c8a0e89a7bd9855": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "auto" - } - } - }, - "version_major": 2, - "version_minor": 0 - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/docs/demos/examples/duckdb/transactions.ipynb b/docs/demos/examples/duckdb/transactions.ipynb index 118aa373e3..6db3209da8 100644 --- a/docs/demos/examples/duckdb/transactions.ipynb +++ b/docs/demos/examples/duckdb/transactions.ipynb @@ -1,1659 +1,1659 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Linking banking transactions\n", - "\n", - "This example shows how to perform a one-to-one link on banking transactions.\n", - "\n", - "The data is fake data, and was generated has the following features:\n", - "\n", - "- Money shows up in the destination account with some time delay\n", - "- The amount sent and the amount received are not always the same - there are hidden fees and foreign exchange effects\n", - "- The memo is sometimes truncated and content is sometimes missing\n", - "\n", - "Since each origin payment should end up in the destination account, the `probability_two_random_records_match` of the model is known.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - " \"Open\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:14.252200Z", - "iopub.status.busy": "2024-05-16T12:13:14.251497Z", - "iopub.status.idle": "2024-05-16T12:13:14.257616Z", - "shell.execute_reply": "2024-05-16T12:13:14.256908Z" - } - }, - "outputs": [], - "source": [ - "# Uncomment and run this cell if you're running in Google Colab.\n", - "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:14.261383Z", - "iopub.status.busy": "2024-05-16T12:13:14.261079Z", - 
"iopub.status.idle": "2024-05-16T12:13:16.084252Z", - "shell.execute_reply": "2024-05-16T12:13:16.083429Z" - } - }, - "outputs": [ + "cells": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ground_truthmemotransaction_dateamountunique_id
00MATTHIAS C paym2022-03-2836.360
11M CORVINUS dona2022-02-14221.911
\n", - "
" - ], - "text/plain": [ - " ground_truth memo transaction_date amount unique_id\n", - "0 0 MATTHIAS C paym 2022-03-28 36.36 0\n", - "1 1 M CORVINUS dona 2022-02-14 221.91 1" + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Linking banking transactions\n", + "\n", + "This example shows how to perform a one-to-one link on banking transactions.\n", + "\n", + "The data is fake data, and was generated has the following features:\n", + "\n", + "- Money shows up in the destination account with some time delay\n", + "- The amount sent and the amount received are not always the same - there are hidden fees and foreign exchange effects\n", + "- The memo is sometimes truncated and content is sometimes missing\n", + "\n", + "Since each origin payment should end up in the destination account, the `probability_two_random_records_match` of the model is known.\n" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ground_truthmemotransaction_dateamountunique_id
00MATTHIAS C payment BGC2022-03-2936.360
11M CORVINUS BGC2022-02-16221.911
\n", - "
" - ], - "text/plain": [ - " ground_truth memo transaction_date amount unique_id\n", - "0 0 MATTHIAS C payment BGC 2022-03-29 36.36 0\n", - "1 1 M CORVINUS BGC 2022-02-16 221.91 1" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + " \"Open\n", + "\n" ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets\n", - "\n", - "df_origin = splink_datasets.transactions_origin\n", - "df_destination = splink_datasets.transactions_destination\n", - "\n", - "display(df_origin.head(2))\n", - "display(df_destination.head(2))" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the following chart, we can see this is a challenging dataset to link:\n", - "\n", - "- There are only 151 distinct transaction dates, with strong skew\n", - "- Some 'memos' are used multiple times (up to 48 times)\n", - "- There is strong skew in the 'amount' column, with 1,400 transactions of around 60.00\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:16.143823Z", - "iopub.status.busy": "2024-05-16T12:13:16.143431Z", - "iopub.status.idle": "2024-05-16T12:13:16.849535Z", - "shell.execute_reply": "2024-05-16T12:13:16.848871Z" - } - }, - "outputs": [ + }, { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.VConcatChart(...)" + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-16T12:13:14.252200Z", + "iopub.status.busy": "2024-05-16T12:13:14.251497Z", + "iopub.status.idle": "2024-05-16T12:13:14.257616Z", + "shell.execute_reply": "2024-05-16T12:13:14.256908Z" + } + }, + "outputs": [], + "source": [ + "# Uncomment and run this cell if you're running in Google Colab.\n", + "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev" ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from splink.exploratory import profile_columns\n", - "\n", - "db_api = DuckDBAPI()\n", - "profile_columns(\n", - " [df_origin, df_destination],\n", - " db_api=db_api,\n", - " column_expressions=[\n", - " \"memo\",\n", - " \"transaction_date\",\n", - " \"amount\",\n", - " ],\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:16.852855Z", - "iopub.status.busy": "2024-05-16T12:13:16.852594Z", - "iopub.status.idle": "2024-05-16T12:13:18.407824Z", - "shell.execute_reply": "2024-05-16T12:13:18.407265Z" - } - }, - "outputs": [ + }, { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" + "cell_type": "code", + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-16T12:13:14.261383Z", + "iopub.status.busy": "2024-05-16T12:13:14.261079Z", + "iopub.status.idle": "2024-05-16T12:13:16.084252Z", + "shell.execute_reply": "2024-05-16T12:13:16.083429Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ground_truthmemotransaction_dateamountunique_id
00MATTHIAS C paym2022-03-2836.360
11M CORVINUS dona2022-02-14221.911
\n", + "
" + ], + "text/plain": [ + " ground_truth memo transaction_date amount unique_id\n", + "0 0 MATTHIAS C paym 2022-03-28 36.36 0\n", + "1 1 M CORVINUS dona 2022-02-14 221.91 1" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ground_truthmemotransaction_dateamountunique_id
00MATTHIAS C payment BGC2022-03-2936.360
11M CORVINUS BGC2022-02-16221.911
\n", + "
" + ], + "text/plain": [ + " ground_truth memo transaction_date amount unique_id\n", + "0 0 MATTHIAS C payment BGC 2022-03-29 36.36 0\n", + "1 1 M CORVINUS BGC 2022-02-16 221.91 1" + ] + }, + "metadata": {}, + "output_type": "display_data" + } ], - "text/plain": [ - "alt.Chart(...)" + "source": [ + "from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets\n", + "\n", + "df_origin = splink_datasets.transactions_origin\n", + "df_destination = splink_datasets.transactions_destination\n", + "\n", + "display(df_origin.head(2))\n", + "display(df_destination.head(2))" ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from splink import DuckDBAPI, block_on\n", - "from splink.blocking_analysis import (\n", - " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n", - ")\n", - "\n", - "# Design blocking rules that allow for differences in transaction date and amounts\n", - "blocking_rule_date_1 = \"\"\"\n", - " strftime(l.transaction_date, '%Y%m') = strftime(r.transaction_date, '%Y%m')\n", - " and substr(l.memo, 1,3) = substr(r.memo,1,3)\n", - " and l.amount/r.amount > 0.7 and l.amount/r.amount < 1.3\n", - "\"\"\"\n", - "\n", - "# Offset by half a month to ensure we capture case when the dates are e.g. 31st Jan and 1st Feb\n", - "blocking_rule_date_2 = \"\"\"\n", - " strftime(l.transaction_date+15, '%Y%m') = strftime(r.transaction_date, '%Y%m')\n", - " and substr(l.memo, 1,3) = substr(r.memo,1,3)\n", - " and l.amount/r.amount > 0.7 and l.amount/r.amount < 1.3\n", - "\"\"\"\n", - "\n", - "blocking_rule_memo = block_on(\"substr(memo,1,9)\")\n", - "\n", - "blocking_rule_amount_1 = \"\"\"\n", - "round(l.amount/2,0)*2 = round(r.amount/2,0)*2 and yearweek(r.transaction_date) = yearweek(l.transaction_date)\n", - "\"\"\"\n", - "\n", - "blocking_rule_amount_2 = \"\"\"\n", - "round(l.amount/2,0)*2 = round((r.amount+1)/2,0)*2 and yearweek(r.transaction_date) = yearweek(l.transaction_date + 4)\n", - "\"\"\"\n", - "\n", - "blocking_rule_cheat = block_on(\"unique_id\")\n", - "\n", - "\n", - "brs = [\n", - " blocking_rule_date_1,\n", - " blocking_rule_date_2,\n", - " blocking_rule_memo,\n", - " blocking_rule_amount_1,\n", - " blocking_rule_amount_2,\n", - " blocking_rule_cheat,\n", - "]\n", - "\n", - "\n", - "db_api = DuckDBAPI()\n", - "\n", - "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", - " table_or_tables=[df_origin, df_destination],\n", - " blocking_rule_creators=brs,\n", - " db_api=db_api,\n", - " link_type=\"link_only\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:18.411066Z", - "iopub.status.busy": "2024-05-16T12:13:18.410832Z", - "iopub.status.idle": "2024-05-16T12:13:18.418094Z", - "shell.execute_reply": "2024-05-16T12:13:18.416984Z" - } - }, - "outputs": [], - "source": [ - "# Full settings for linking model\n", - "import splink.comparison_level_library as cll\n", - "import splink.comparison_library as cl\n", - "\n", - "comparison_amount = {\n", - " \"output_column_name\": \"amount\",\n", - " \"comparison_levels\": [\n", - " cll.NullLevel(\"amount\"),\n", - " cll.ExactMatchLevel(\"amount\"),\n", - " cll.PercentageDifferenceLevel(\"amount\", 0.01),\n", - " cll.PercentageDifferenceLevel(\"amount\", 0.03),\n", - " cll.PercentageDifferenceLevel(\"amount\", 0.1),\n", - " cll.PercentageDifferenceLevel(\"amount\", 0.3),\n", - " cll.ElseLevel(),\n", - " ],\n", - " \"comparison_description\": 
\"Amount percentage difference\",\n", - "}\n", - "\n", - "# The date distance is one sided becaause transactions should only arrive after they've left\n", - "# As a result, the comparison_template_library date difference functions are not appropriate\n", - "within_n_days_template = \"transaction_date_r - transaction_date_l <= {n} and transaction_date_r >= transaction_date_l\"\n", - "\n", - "comparison_date = {\n", - " \"output_column_name\": \"transaction_date\",\n", - " \"comparison_levels\": [\n", - " cll.NullLevel(\"transaction_date\"),\n", - " {\n", - " \"sql_condition\": within_n_days_template.format(n=1),\n", - " \"label_for_charts\": \"1 day\",\n", - " },\n", - " {\n", - " \"sql_condition\": within_n_days_template.format(n=4),\n", - " \"label_for_charts\": \"<=4 days\",\n", - " },\n", - " {\n", - " \"sql_condition\": within_n_days_template.format(n=10),\n", - " \"label_for_charts\": \"<=10 days\",\n", - " },\n", - " {\n", - " \"sql_condition\": within_n_days_template.format(n=30),\n", - " \"label_for_charts\": \"<=30 days\",\n", - " },\n", - " cll.ElseLevel(),\n", - " ],\n", - " \"comparison_description\": \"Transaction date days apart\",\n", - "}\n", - "\n", - "\n", - "settings = SettingsCreator(\n", - " link_type=\"link_only\",\n", - " probability_two_random_records_match=1 / len(df_origin),\n", - " blocking_rules_to_generate_predictions=[\n", - " blocking_rule_date_1,\n", - " blocking_rule_date_2,\n", - " blocking_rule_memo,\n", - " blocking_rule_amount_1,\n", - " blocking_rule_amount_2,\n", - " blocking_rule_cheat,\n", - " ],\n", - " comparisons=[\n", - " comparison_amount,\n", - " cl.LevenshteinAtThresholds(\"memo\", [2, 6, 10]),\n", - " comparison_date,\n", - " ],\n", - " retain_intermediate_calculation_columns=True,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:18.421517Z", - "iopub.status.busy": "2024-05-16T12:13:18.421286Z", - "iopub.status.idle": "2024-05-16T12:13:18.552970Z", - "shell.execute_reply": "2024-05-16T12:13:18.552184Z" - } - }, - "outputs": [], - "source": [ - "linker = Linker(\n", - " [df_origin, df_destination],\n", - " settings,\n", - " input_table_aliases=[\"__ori\", \"_dest\"],\n", - " database_api=db_api,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:18.556284Z", - "iopub.status.busy": "2024-05-16T12:13:18.556053Z", - "iopub.status.idle": "2024-05-16T12:13:20.529952Z", - "shell.execute_reply": "2024-05-16T12:13:20.529065Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "----- Estimating u probabilities using random sampling -----\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Estimated u probabilities using random sampling\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Your model is not yet fully trained. 
Missing estimates for:\n", - " - amount (no m values are trained).\n", - " - memo (no m values are trained).\n", - " - transaction_date (no m values are trained).\n" - ] - } - ], - "source": [ - "linker.estimate_u_using_random_sampling(max_pairs=1e6)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:20.532832Z", - "iopub.status.busy": "2024-05-16T12:13:20.532606Z", - "iopub.status.idle": "2024-05-16T12:13:21.867808Z", - "shell.execute_reply": "2024-05-16T12:13:21.867084Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "----- Starting EM training session -----\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Estimating the m probabilities of the model by blocking on:\n", - "l.\"memo\" = r.\"memo\"\n", - "\n", - "Parameter estimates will be made for the following comparison(s):\n", - " - amount\n", - " - transaction_date\n", - "\n", - "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", - " - memo\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 1: Largest change in params was -0.58 in the m_probability of amount, level `Exact match on amount`\n" - ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 2: Largest change in params was -0.185 in the m_probability of transaction_date, level `1 day`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 3: Largest change in params was 0.0104 in the m_probability of amount, level `Percentage difference of 'amount' within 10.00%`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 4: Largest change in params was 0.00228 in the m_probability of transaction_date, level `<=30 days`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 5: Largest change in params was 0.000398 in the m_probability of transaction_date, level `<=30 days`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 6: Largest change in params was -0.00024 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 7: Largest change in params was -0.000218 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 8: Largest change in params was -0.000199 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 9: Largest change in params was -0.000182 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 10: Largest change in params was -0.000167 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 11: Largest change in params was -0.000154 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 12: Largest change in params was -0.000142 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - 
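The comparison levels built from `within_n_days_template` in the settings above are plain formatted SQL strings, with the `transaction_date_r >= transaction_date_l` clause enforcing the one-sided direction. A small sketch of the expansion (`n=4` reproduces the '<=4 days' level):

```python
within_n_days_template = (
    "transaction_date_r - transaction_date_l <= {n}"
    " and transaction_date_r >= transaction_date_l"
)
print(within_n_days_template.format(n=4))
# transaction_date_r - transaction_date_l <= 4 and transaction_date_r >= transaction_date_l
```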
"name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 13: Largest change in params was -0.000132 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 14: Largest change in params was -0.000122 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 15: Largest change in params was -0.000114 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 16: Largest change in params was -0.000106 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 17: Largest change in params was -9.92e-05 in the m_probability of amount, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "EM converged after 17 iterations\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Your model is not yet fully trained. Missing estimates for:\n", - " - memo (no m values are trained).\n" - ] - }, - { - "data": { - "text/plain": [ - "" + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the following chart, we can see this is a challenging dataset to link:\n", + "\n", + "- There are only 151 distinct transaction dates, with strong skew\n", + "- Some 'memos' are used multiple times (up to 48 times)\n", + "- There is strong skew in the 'amount' column, with 1,400 transactions of around 60.00\n" ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "linker.estimate_parameters_using_expectation_maximisation(block_on(\"memo\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:21.871283Z", - "iopub.status.busy": "2024-05-16T12:13:21.871004Z", - "iopub.status.idle": "2024-05-16T12:13:23.094606Z", - "shell.execute_reply": "2024-05-16T12:13:23.093838Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "----- Starting EM training session -----\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Estimating the m probabilities of the model by blocking on:\n", - "l.\"amount\" = r.\"amount\"\n", - "\n", - "Parameter estimates will be made for the following comparison(s):\n", - " - memo\n", - " - transaction_date\n", - "\n", - "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", - " - amount\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 1: Largest change in params was -0.435 in the m_probability of memo, level `Exact match on memo`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 2: Largest change in params was -0.0613 in the m_probability of memo, level `Exact match on memo`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 3: Largest change in params was 0.0114 in the m_probability of memo, level `Levenshtein distance of memo <= 10`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 4: Largest change in params 
was 0.00446 in the m_probability of memo, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 5: Largest change in params was 0.00436 in the m_probability of memo, level `All other comparisons`\n" - ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 6: Largest change in params was 0.00398 in the m_probability of memo, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 7: Largest change in params was 0.00345 in the m_probability of memo, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 8: Largest change in params was 0.00287 in the m_probability of memo, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 9: Largest change in params was 0.00231 in the m_probability of memo, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 10: Largest change in params was 0.00181 in the m_probability of memo, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 11: Largest change in params was 0.00139 in the m_probability of memo, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 12: Largest change in params was 0.00105 in the m_probability of memo, level `All other comparisons`\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 13: Largest change in params was 0.000789 in the m_probability of memo, level `All other comparisons`\n" - ] + "cell_type": "code", + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-16T12:13:16.143823Z", + "iopub.status.busy": "2024-05-16T12:13:16.143431Z", + "iopub.status.idle": "2024-05-16T12:13:16.849535Z", + "shell.execute_reply": "2024-05-16T12:13:16.848871Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink.exploratory import profile_columns\n", + "\n", + "db_api = DuckDBAPI()\n", + "profile_columns(\n", + " [df_origin, df_destination],\n", + " db_api=db_api,\n", + " column_expressions=[\n", + " \"memo\",\n", + " \"transaction_date\",\n", + " \"amount\",\n", + " ],\n", + ")" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 14: Largest change in params was 0.000586 in the m_probability of memo, level `All other comparisons`\n" - ] + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-16T12:13:16.852855Z", + "iopub.status.busy": "2024-05-16T12:13:16.852594Z", + "iopub.status.idle": "2024-05-16T12:13:18.407824Z", + "shell.execute_reply": "2024-05-16T12:13:18.407265Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink import DuckDBAPI, block_on\n", + "from splink.blocking_analysis import (\n", + " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n", + ")\n", + "\n", + "# Design blocking rules that allow for differences in transaction date and amounts\n", + "blocking_rule_date_1 = \"\"\"\n", + " strftime(l.transaction_date, '%Y%m') = strftime(r.transaction_date, '%Y%m')\n", + " and substr(l.memo, 1,3) = substr(r.memo,1,3)\n", + " and l.amount/r.amount > 0.7 and l.amount/r.amount < 1.3\n", + "\"\"\"\n", + "\n", + "# Offset by half a month to ensure we capture case when the dates are e.g. 31st Jan and 1st Feb\n", + "blocking_rule_date_2 = \"\"\"\n", + " strftime(l.transaction_date+15, '%Y%m') = strftime(r.transaction_date, '%Y%m')\n", + " and substr(l.memo, 1,3) = substr(r.memo,1,3)\n", + " and l.amount/r.amount > 0.7 and l.amount/r.amount < 1.3\n", + "\"\"\"\n", + "\n", + "blocking_rule_memo = block_on(\"substr(memo,1,9)\")\n", + "\n", + "blocking_rule_amount_1 = \"\"\"\n", + "round(l.amount/2,0)*2 = round(r.amount/2,0)*2 and yearweek(r.transaction_date) = yearweek(l.transaction_date)\n", + "\"\"\"\n", + "\n", + "blocking_rule_amount_2 = \"\"\"\n", + "round(l.amount/2,0)*2 = round((r.amount+1)/2,0)*2 and yearweek(r.transaction_date) = yearweek(l.transaction_date + 4)\n", + "\"\"\"\n", + "\n", + "blocking_rule_cheat = block_on(\"unique_id\")\n", + "\n", + "\n", + "brs = [\n", + " blocking_rule_date_1,\n", + " blocking_rule_date_2,\n", + " blocking_rule_memo,\n", + " blocking_rule_amount_1,\n", + " blocking_rule_amount_2,\n", + " blocking_rule_cheat,\n", + "]\n", + "\n", + "\n", + "db_api = DuckDBAPI()\n", + "\n", + "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", + " table_or_tables=[df_origin, df_destination],\n", + " blocking_rules=brs,\n", + " db_api=db_api,\n", + " link_type=\"link_only\"\n", + ")" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 15: Largest change in params was 0.000433 in the m_probability of memo, level `All other comparisons`\n" - ] + "cell_type": "code", + "execution_count": 5, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-16T12:13:18.411066Z", + "iopub.status.busy": "2024-05-16T12:13:18.410832Z", + "iopub.status.idle": "2024-05-16T12:13:18.418094Z", + "shell.execute_reply": "2024-05-16T12:13:18.416984Z" + } + }, + "outputs": [], + "source": [ + "# Full settings for linking model\n", + "import splink.comparison_level_library as cll\n", + "import splink.comparison_library as cl\n", + "\n", + "comparison_amount = {\n", + " \"output_column_name\": \"amount\",\n", + " \"comparison_levels\": [\n", + " cll.NullLevel(\"amount\"),\n", + " cll.ExactMatchLevel(\"amount\"),\n", + " cll.PercentageDifferenceLevel(\"amount\", 0.01),\n", + " cll.PercentageDifferenceLevel(\"amount\", 0.03),\n", + " cll.PercentageDifferenceLevel(\"amount\", 0.1),\n", + " cll.PercentageDifferenceLevel(\"amount\", 0.3),\n", + " cll.ElseLevel(),\n", + " ],\n", + " \"comparison_description\": \"Amount percentage difference\",\n", + "}\n", + "\n", + "# The date distance is one sided becaause transactions should only arrive after they've left\n", + "# As a result, the comparison_template_library date difference functions are not appropriate\n", + "within_n_days_template = \"transaction_date_r - transaction_date_l <= {n} and transaction_date_r >= 
transaction_date_l\"\n", + "\n", + "comparison_date = {\n", + " \"output_column_name\": \"transaction_date\",\n", + " \"comparison_levels\": [\n", + " cll.NullLevel(\"transaction_date\"),\n", + " {\n", + " \"sql_condition\": within_n_days_template.format(n=1),\n", + " \"label_for_charts\": \"1 day\",\n", + " },\n", + " {\n", + " \"sql_condition\": within_n_days_template.format(n=4),\n", + " \"label_for_charts\": \"<=4 days\",\n", + " },\n", + " {\n", + " \"sql_condition\": within_n_days_template.format(n=10),\n", + " \"label_for_charts\": \"<=10 days\",\n", + " },\n", + " {\n", + " \"sql_condition\": within_n_days_template.format(n=30),\n", + " \"label_for_charts\": \"<=30 days\",\n", + " },\n", + " cll.ElseLevel(),\n", + " ],\n", + " \"comparison_description\": \"Transaction date days apart\",\n", + "}\n", + "\n", + "\n", + "settings = SettingsCreator(\n", + " link_type=\"link_only\",\n", + " probability_two_random_records_match=1 / len(df_origin),\n", + " blocking_rules_to_generate_predictions=[\n", + " blocking_rule_date_1,\n", + " blocking_rule_date_2,\n", + " blocking_rule_memo,\n", + " blocking_rule_amount_1,\n", + " blocking_rule_amount_2,\n", + " blocking_rule_cheat,\n", + " ],\n", + " comparisons=[\n", + " comparison_amount,\n", + " cl.LevenshteinAtThresholds(\"memo\", [2, 6, 10]),\n", + " comparison_date,\n", + " ],\n", + " retain_intermediate_calculation_columns=True,\n", + ")" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 16: Largest change in params was 0.000318 in the m_probability of memo, level `All other comparisons`\n" - ] + "cell_type": "code", + "execution_count": 6, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-16T12:13:18.421517Z", + "iopub.status.busy": "2024-05-16T12:13:18.421286Z", + "iopub.status.idle": "2024-05-16T12:13:18.552970Z", + "shell.execute_reply": "2024-05-16T12:13:18.552184Z" + } + }, + "outputs": [], + "source": [ + "linker = Linker(\n", + " [df_origin, df_destination],\n", + " settings,\n", + " input_table_aliases=[\"__ori\", \"_dest\"],\n", + " database_api=db_api,\n", + ")" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 17: Largest change in params was 0.000233 in the m_probability of memo, level `All other comparisons`\n" - ] + "cell_type": "code", + "execution_count": 7, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-16T12:13:18.556284Z", + "iopub.status.busy": "2024-05-16T12:13:18.556053Z", + "iopub.status.idle": "2024-05-16T12:13:20.529952Z", + "shell.execute_reply": "2024-05-16T12:13:20.529065Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "----- Estimating u probabilities using random sampling -----\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Estimated u probabilities using random sampling\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. 
Missing estimates for:\n", + " - amount (no m values are trained).\n", + " - memo (no m values are trained).\n", + " - transaction_date (no m values are trained).\n" + ] + } + ], + "source": [ + "linker.estimate_u_using_random_sampling(max_pairs=1e6)" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 18: Largest change in params was 0.00017 in the m_probability of memo, level `All other comparisons`\n" - ] + "cell_type": "code", + "execution_count": 8, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-16T12:13:20.532832Z", + "iopub.status.busy": "2024-05-16T12:13:20.532606Z", + "iopub.status.idle": "2024-05-16T12:13:21.867808Z", + "shell.execute_reply": "2024-05-16T12:13:21.867084Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.\"memo\" = r.\"memo\"\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - amount\n", + " - transaction_date\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - memo\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was -0.58 in the m_probability of amount, level `Exact match on amount`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was -0.185 in the m_probability of transaction_date, level `1 day`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was 0.0104 in the m_probability of amount, level `Percentage difference of 'amount' within 10.00%`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 4: Largest change in params was 0.00228 in the m_probability of transaction_date, level `<=30 days`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 5: Largest change in params was 0.000398 in the m_probability of transaction_date, level `<=30 days`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 6: Largest change in params was -0.00024 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 7: Largest change in params was -0.000218 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 8: Largest change in params was -0.000199 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 9: Largest change in params was -0.000182 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 10: Largest change in params was -0.000167 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 11: Largest change in params was -0.000154 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "Iteration 12: Largest change in params was -0.000142 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 13: Largest change in params was -0.000132 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 14: Largest change in params was -0.000122 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 15: Largest change in params was -0.000114 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 16: Largest change in params was -0.000106 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 17: Largest change in params was -9.92e-05 in the m_probability of amount, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 17 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. Missing estimates for:\n", + " - memo (no m values are trained).\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.estimate_parameters_using_expectation_maximisation(block_on(\"memo\"))" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 19: Largest change in params was 0.000124 in the m_probability of memo, level `All other comparisons`\n" - ] + "cell_type": "code", + "execution_count": 9, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-16T12:13:21.871283Z", + "iopub.status.busy": "2024-05-16T12:13:21.871004Z", + "iopub.status.idle": "2024-05-16T12:13:23.094606Z", + "shell.execute_reply": "2024-05-16T12:13:23.093838Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.\"amount\" = r.\"amount\"\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - memo\n", + " - transaction_date\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - amount\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was -0.435 in the m_probability of memo, level `Exact match on memo`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was -0.0613 in the m_probability of memo, level `Exact match on memo`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was 0.0114 in the m_probability of memo, level `Levenshtein distance of memo <= 10`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 4: Largest change in params was 0.00446 in the m_probability of memo, 
level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 5: Largest change in params was 0.00436 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 6: Largest change in params was 0.00398 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 7: Largest change in params was 0.00345 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 8: Largest change in params was 0.00287 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 9: Largest change in params was 0.00231 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 10: Largest change in params was 0.00181 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 11: Largest change in params was 0.00139 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 12: Largest change in params was 0.00105 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 13: Largest change in params was 0.000789 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 14: Largest change in params was 0.000586 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 15: Largest change in params was 0.000433 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 16: Largest change in params was 0.000318 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 17: Largest change in params was 0.000233 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 18: Largest change in params was 0.00017 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 19: Largest change in params was 0.000124 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 20: Largest change in params was 9.05e-05 in the m_probability of memo, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 20 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is fully trained. 
All comparisons have at least one estimate for their m and u values\n" + ] + } + ], + "source": [ + "session = linker.estimate_parameters_using_expectation_maximisation(block_on(\"amount\"))" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 20: Largest change in params was 9.05e-05 in the m_probability of memo, level `All other comparisons`\n" - ] + "cell_type": "code", + "execution_count": 10, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-16T12:13:23.097922Z", + "iopub.status.busy": "2024-05-16T12:13:23.097670Z", + "iopub.status.idle": "2024-05-16T12:13:23.382589Z", + "shell.execute_reply": "2024-05-16T12:13:23.382014Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.match_weights_chart()" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "EM converged after 20 iterations\n" - ] + "cell_type": "code", + "execution_count": 11, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-16T12:13:23.385651Z", + "iopub.status.busy": "2024-05-16T12:13:23.385430Z", + "iopub.status.idle": "2024-05-16T12:13:47.966948Z", + "shell.execute_reply": "2024-05-16T12:13:47.966113Z" + } + }, + "outputs": [], + "source": [ + "df_predict = linker.predict(threshold_match_probability=0.001)" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n" - ] - } - ], - "source": [ - "session = linker.estimate_parameters_using_expectation_maximisation(block_on(\"amount\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:23.097922Z", - "iopub.status.busy": "2024-05-16T12:13:23.097670Z", - "iopub.status.idle": "2024-05-16T12:13:23.382589Z", - "shell.execute_reply": "2024-05-16T12:13:23.382014Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" + "cell_type": "code", + "execution_count": 12, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-16T12:13:47.970901Z", + "iopub.status.busy": "2024-05-16T12:13:47.970603Z", + "iopub.status.idle": "2024-05-16T12:13:48.365220Z", + "shell.execute_reply": "2024-05-16T12:13:48.364442Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - "alt.VConcatChart(...)" + "source": [ + "linker.comparison_viewer_dashboard(\n", + " df_predict, \"dashboards/comparison_viewer_transactions.html\", overwrite=True\n", + ")\n", + "from IPython.display import IFrame\n", + "\n", + "IFrame(\n", + " src=\"./dashboards/comparison_viewer_transactions.html\", width=\"100%\", height=1200\n", + ")" ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "linker.match_weights_chart()" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:23.385651Z", - "iopub.status.busy": "2024-05-16T12:13:23.385430Z", - "iopub.status.idle": "2024-05-16T12:13:47.966948Z", - "shell.execute_reply": "2024-05-16T12:13:47.966113Z" - } - }, - "outputs": [], - "source": [ - "df_predict = linker.predict(threshold_match_probability=0.001)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:47.970901Z", - "iopub.status.busy": "2024-05-16T12:13:47.970603Z", - "iopub.status.idle": "2024-05-16T12:13:48.365220Z", - "shell.execute_reply": "2024-05-16T12:13:48.364442Z" - } - }, - "outputs": [ + }, { - "data": { - "text/html": [ - "\n", - " \n", - " " + "cell_type": "code", + "execution_count": 13, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-16T12:13:48.369330Z", + "iopub.status.busy": "2024-05-16T12:13:48.369001Z", + "iopub.status.idle": "2024-05-16T12:13:54.043730Z", + "shell.execute_reply": "2024-05-16T12:13:54.043073Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - "" + "source": [ + "pred_errors = linker.prediction_errors_from_labels_column(\n", + " \"ground_truth\", include_false_positives=True, include_false_negatives=False\n", + ")\n", + "linker.waterfall_chart(pred_errors.as_record_dict(limit=5))" ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "linker.comparison_viewer_dashboard(\n", - " df_predict, \"dashboards/comparison_viewer_transactions.html\", overwrite=True\n", - ")\n", - "from IPython.display import IFrame\n", - "\n", - "IFrame(\n", - " src=\"./dashboards/comparison_viewer_transactions.html\", width=\"100%\", height=1200\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:48.369330Z", - "iopub.status.busy": "2024-05-16T12:13:48.369001Z", - "iopub.status.idle": "2024-05-16T12:13:54.043730Z", - "shell.execute_reply": "2024-05-16T12:13:54.043073Z" - } - }, - "outputs": [ + }, { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" + "cell_type": "code", + "execution_count": 14, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-16T12:13:54.047308Z", + "iopub.status.busy": "2024-05-16T12:13:54.047030Z", + "iopub.status.idle": "2024-05-16T12:13:54.884355Z", + "shell.execute_reply": "2024-05-16T12:13:54.883814Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - "alt.LayerChart(...)" + "source": [ + "pred_errors = linker.prediction_errors_from_labels_column(\n", + " \"ground_truth\", include_false_positives=False, include_false_negatives=True\n", + ")\n", + "linker.waterfall_chart(pred_errors.as_record_dict(limit=5))" ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" } - ], - "source": [ - "pred_errors = linker.prediction_errors_from_labels_column(\n", - " \"ground_truth\", include_false_positives=True, include_false_negatives=False\n", - ")\n", - "linker.waterfall_chart(pred_errors.as_record_dict(limit=5))" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:13:54.047308Z", - "iopub.status.busy": "2024-05-16T12:13:54.047030Z", - "iopub.status.idle": "2024-05-16T12:13:54.884355Z", - "shell.execute_reply": "2024-05-16T12:13:54.883814Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.LayerChart(...)" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": { + "0cb4a943a08a42c7841ca32d466f9eed": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_fd157120a2ca488496c737cec882713d", + "max": 100, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ed234594aea94bf98ffb67a51d3811f4", + "tabbable": null, + "tooltip": null, + "value": 100 + } + }, + "2bae68755fc34e38ac69e792f314ba8e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "bar_color": "black", + "description_width": "" + } + }, + "4430006dcc174ff092d96adf68c301ff": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_5c32bb2a7a714bd79accac15915b17e5", + "max": 100, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_6222247c7cbe45b19cfeb9b182147a18", + "tabbable": null, + "tooltip": null, + "value": 100 + } + }, + "5c32bb2a7a714bd79accac15915b17e5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + 
"justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "auto" + } + }, + "6222247c7cbe45b19cfeb9b182147a18": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "bar_color": "black", + "description_width": "" + } + }, + "63719efff46e49ecba53edb438f35c3f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_921bb606e07743f7a252c05830098a57", + "max": 100, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2bae68755fc34e38ac69e792f314ba8e", + "tabbable": null, + "tooltip": null, + "value": 100 + } + }, + "921bb606e07743f7a252c05830098a57": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "auto" + } + }, + "ed234594aea94bf98ffb67a51d3811f4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "bar_color": "black", + "description_width": "" + } + }, + "fd157120a2ca488496c737cec882713d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": 
"@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "auto" + } + } + }, + "version_major": 2, + "version_minor": 0 + } } - ], - "source": [ - "pred_errors = linker.prediction_errors_from_labels_column(\n", - " \"ground_truth\", include_false_positives=False, include_false_negatives=True\n", - ")\n", - "linker.waterfall_chart(pred_errors.as_record_dict(limit=5))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "state": { - "0cb4a943a08a42c7841ca32d466f9eed": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_fd157120a2ca488496c737cec882713d", - "max": 100, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_ed234594aea94bf98ffb67a51d3811f4", - "tabbable": null, - "tooltip": null, - "value": 100 - } - }, - "2bae68755fc34e38ac69e792f314ba8e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "bar_color": "black", - "description_width": "" - } - }, - "4430006dcc174ff092d96adf68c301ff": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": 
"ProgressView", - "bar_style": "", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_5c32bb2a7a714bd79accac15915b17e5", - "max": 100, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_6222247c7cbe45b19cfeb9b182147a18", - "tabbable": null, - "tooltip": null, - "value": 100 - } - }, - "5c32bb2a7a714bd79accac15915b17e5": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "auto" - } - }, - "6222247c7cbe45b19cfeb9b182147a18": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "bar_color": "black", - "description_width": "" - } - }, - "63719efff46e49ecba53edb438f35c3f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_921bb606e07743f7a252c05830098a57", - "max": 100, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_2bae68755fc34e38ac69e792f314ba8e", - "tabbable": null, - "tooltip": null, - "value": 100 - } - }, - "921bb606e07743f7a252c05830098a57": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - 
"grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "auto" - } - }, - "ed234594aea94bf98ffb67a51d3811f4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "bar_color": "black", - "description_width": "" - } - }, - "fd157120a2ca488496c737cec882713d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "auto" - } - } - }, - "version_major": 2, - "version_minor": 0 - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb b/docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb index 898f9b4af5..f27d98e341 100644 --- a/docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb +++ b/docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb @@ -1,1535 +1,1535 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Linking a dataset of real historical persons\n", - "\n", - "In this example, we deduplicate a more realistic dataset. The data is based on historical persons scraped from wikidata. Duplicate records are introduced with a variety of errors introduced.\n", - "\n", - "Note, as explained in the [backends topic guide](https://moj-analytical-services.github.io/splink/topic_guides/backends.html#sqlite), SQLite does not natively support string fuzzy matching functions such as `damareau-levenshtein` and `jaro-winkler` (as used in this example). 
Instead, these have been imported as python User Defined Functions (UDFs). One drawback of python UDFs is that they are considerably slower than native-SQL comparisons. As such, if you are hitting issues with large run times, consider switching to DuckDB (or some other backend).\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - " \"Open\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T18:41:30.610213Z", - "iopub.status.busy": "2024-05-15T18:41:30.609846Z", - "iopub.status.idle": "2024-05-15T18:41:30.615335Z", - "shell.execute_reply": "2024-05-15T18:41:30.614566Z" - } - }, - "outputs": [], - "source": [ - "# Uncomment and run this cell if you're running in Google Colab.\n", - "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev\n", - "# !pip install rapidfuzz" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T18:41:30.619046Z", - "iopub.status.busy": "2024-05-15T18:41:30.618760Z", - "iopub.status.idle": "2024-05-15T18:41:31.933775Z", - "shell.execute_reply": "2024-05-15T18:41:31.932989Z" - } - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "from splink import splink_datasets\n", - "\n", - "pd.options.display.max_rows = 1000\n", - "# reduce size of dataset to make things run faster\n", - "df = splink_datasets.historical_50k.sample(5000)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T18:41:31.938051Z", - "iopub.status.busy": "2024-05-15T18:41:31.937677Z", - "iopub.status.idle": "2024-05-15T18:41:32.856954Z", - "shell.execute_reply": "2024-05-15T18:41:32.856284Z" - } - }, - "outputs": [ + "cells": [ { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.VConcatChart(...)" + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Linking a dataset of real historical persons\n", + "\n", + "In this example, we deduplicate a more realistic dataset. The data is based on historical persons scraped from wikidata. Duplicate records are introduced with a variety of errors introduced.\n", + "\n", + "Note, as explained in the [backends topic guide](https://moj-analytical-services.github.io/splink/topic_guides/backends.html#sqlite), SQLite does not natively support string fuzzy matching functions such as `damareau-levenshtein` and `jaro-winkler` (as used in this example). Instead, these have been imported as python User Defined Functions (UDFs). One drawback of python UDFs is that they are considerably slower than native-SQL comparisons. As such, if you are hitting issues with large run times, consider switching to DuckDB (or some other backend).\n" ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from splink import SQLiteAPI\n", - "from splink.exploratory import profile_columns\n", - "\n", - "db_api = SQLiteAPI()\n", - "profile_columns(\n", - " df, db_api, column_expressions=[\"first_name\", \"postcode_fake\", \"substr(dob, 1,4)\"]\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T18:41:32.900620Z", - "iopub.status.busy": "2024-05-15T18:41:32.900280Z", - "iopub.status.idle": "2024-05-15T18:41:33.193607Z", - "shell.execute_reply": "2024-05-15T18:41:33.192963Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from splink import SQLiteAPI, block_on\n", - "from splink.blocking_analysis import (\n", - " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n", - ")\n", - "\n", - "blocking_rules = [block_on(\"first_name\", \"surname\"),\n", - " block_on(\"surname\", \"dob\"),\n", - " block_on(\"first_name\", \"dob\"),\n", - " block_on(\"postcode_fake\", \"first_name\")]\n", - "\n", - "db_api = SQLiteAPI()\n", - "\n", - "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", - " table_or_tables=df,\n", - " blocking_rule_creators=blocking_rules,\n", - " db_api=db_api,\n", - " link_type=\"dedupe_only\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T18:41:33.197015Z", - "iopub.status.busy": "2024-05-15T18:41:33.196743Z", - "iopub.status.idle": "2024-05-15T18:41:33.330331Z", - "shell.execute_reply": "2024-05-15T18:41:33.329671Z" - } - }, - "outputs": [], - "source": [ - "import splink.comparison_library as cl\n", - "import splink.comparison_template_library as ctl\n", - "from splink import Linker\n", - "\n", - "settings = {\n", - " \"link_type\": \"dedupe_only\",\n", - " \"blocking_rules_to_generate_predictions\": [\n", - " block_on(\"first_name\", \"surname\"),\n", - " block_on(\"surname\", \"dob\"),\n", - " block_on(\"first_name\", \"dob\"),\n", - " block_on(\"postcode_fake\", \"first_name\"),\n", - "\n", - " ],\n", - " \"comparisons\": [\n", - " ctl.NameComparison(\"first_name\", fuzzy_thresholds=[0.9]).configure(\n", - " term_frequency_adjustments=True\n", - " ),\n", - " ctl.NameComparison(\"surname\", fuzzy_thresholds=[0.9]).configure(\n", - " term_frequency_adjustments=True\n", - " ),\n", - " cl.DamerauLevenshteinAtThresholds(\"dob\", [1, 2]).configure(\n", - " term_frequency_adjustments=True\n", - " ),\n", - " cl.DamerauLevenshteinAtThresholds(\"postcode_fake\", [1, 2]),\n", - " cl.ExactMatch(\"birth_place\").configure(term_frequency_adjustments=True),\n", - " cl.ExactMatch(\n", - " \"occupation\",\n", - " ).configure(term_frequency_adjustments=True),\n", - " ],\n", - " \"retain_matching_columns\": True,\n", - " \"retain_intermediate_calculation_columns\": True,\n", - " \"max_iterations\": 10,\n", - " \"em_convergence\": 0.01,\n", - "}\n", - "\n", - "linker = Linker(df, settings, database_api=db_api)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T18:41:33.334300Z", - "iopub.status.busy": "2024-05-15T18:41:33.333988Z", - "iopub.status.idle": "2024-05-15T18:41:33.488238Z", - "shell.execute_reply": "2024-05-15T18:41:33.487555Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Probability two random records match is estimated to be 0.00013.\n", - "This means that amongst all possible pairwise record comparisons, one in 7,667.18 are expected to match. 
With 12,497,500 total possible comparisons, we expect a total of around 1,630.00 matching pairs\n" - ] - } - ], - "source": [ - "linker.estimate_probability_two_random_records_match(\n", - " [\n", - " \"l.first_name = r.first_name and l.surname = r.surname and l.dob = r.dob\",\n", - " \"substr(l.first_name,1,2) = substr(r.first_name,1,2) and l.surname = r.surname and substr(l.postcode_fake,1,2) = substr(r.postcode_fake,1,2)\",\n", - " \"l.dob = r.dob and l.postcode_fake = r.postcode_fake\",\n", - " ],\n", - " recall=0.6,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T18:41:33.491551Z", - "iopub.status.busy": "2024-05-15T18:41:33.491328Z", - "iopub.status.idle": "2024-05-15T18:41:41.469753Z", - "shell.execute_reply": "2024-05-15T18:41:41.469157Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "----- Estimating u probabilities using random sampling -----\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Estimated u probabilities using random sampling\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Your model is not yet fully trained. Missing estimates for:\n", - " - first_name (no m values are trained).\n", - " - surname (no m values are trained).\n", - " - dob (no m values are trained).\n", - " - postcode_fake (no m values are trained).\n", - " - birth_place (no m values are trained).\n", - " - occupation (no m values are trained).\n" - ] - } - ], - "source": [ - "linker.estimate_u_using_random_sampling(max_pairs=1e6)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T18:41:41.473301Z", - "iopub.status.busy": "2024-05-15T18:41:41.473009Z", - "iopub.status.idle": "2024-05-15T18:41:41.683463Z", - "shell.execute_reply": "2024-05-15T18:41:41.682843Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "----- Starting EM training session -----\n", - "\n" - ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Estimating the m probabilities of the model by blocking on:\n", - "l.first_name = r.first_name and l.surname = r.surname\n", - "\n", - "Parameter estimates will be made for the following comparison(s):\n", - " - dob\n", - " - postcode_fake\n", - " - birth_place\n", - " - occupation\n", - "\n", - "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", - " - first_name\n", - " - surname\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + " \"Open\n", + "" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 1: Largest change in params was -0.435 in probability_two_random_records_match\n" - ] + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:30.610213Z", + "iopub.status.busy": "2024-05-15T18:41:30.609846Z", + "iopub.status.idle": "2024-05-15T18:41:30.615335Z", + "shell.execute_reply": "2024-05-15T18:41:30.614566Z" + } + }, + "outputs": [], + "source": [ + "# Uncomment and run this cell if you're running in Google Colab.\n", + "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev\n", + "# !pip install rapidfuzz" + ] }, { - "name": "stderr", - "output_type": 
"stream", - "text": [ - "Iteration 2: Largest change in params was -0.0222 in probability_two_random_records_match\n" - ] + "cell_type": "code", + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:30.619046Z", + "iopub.status.busy": "2024-05-15T18:41:30.618760Z", + "iopub.status.idle": "2024-05-15T18:41:31.933775Z", + "shell.execute_reply": "2024-05-15T18:41:31.932989Z" + } + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "from splink import splink_datasets\n", + "\n", + "pd.options.display.max_rows = 1000\n", + "# reduce size of dataset to make things run faster\n", + "df = splink_datasets.historical_50k.sample(5000)" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 3: Largest change in params was -0.00727 in the m_probability of birth_place, level `All other comparisons`\n" - ] + "cell_type": "code", + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:31.938051Z", + "iopub.status.busy": "2024-05-15T18:41:31.937677Z", + "iopub.status.idle": "2024-05-15T18:41:32.856954Z", + "shell.execute_reply": "2024-05-15T18:41:32.856284Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink import SQLiteAPI\n", + "from splink.exploratory import profile_columns\n", + "\n", + "db_api = SQLiteAPI()\n", + "profile_columns(\n", + " df, db_api, column_expressions=[\"first_name\", \"postcode_fake\", \"substr(dob, 1,4)\"]\n", + ")" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "EM converged after 3 iterations\n" - ] + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:32.900620Z", + "iopub.status.busy": "2024-05-15T18:41:32.900280Z", + "iopub.status.idle": "2024-05-15T18:41:33.193607Z", + "shell.execute_reply": "2024-05-15T18:41:33.192963Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink import SQLiteAPI, block_on\n", + "from splink.blocking_analysis import (\n", + " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n", + ")\n", + "\n", + "blocking_rules = [block_on(\"first_name\", \"surname\"),\n", + " block_on(\"surname\", \"dob\"),\n", + " block_on(\"first_name\", \"dob\"),\n", + " block_on(\"postcode_fake\", \"first_name\")]\n", + "\n", + "db_api = SQLiteAPI()\n", + "\n", + "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", + " table_or_tables=df,\n", + " blocking_rules=blocking_rules,\n", + " db_api=db_api,\n", + " link_type=\"dedupe_only\"\n", + ")" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Your model is not yet fully trained. Missing estimates for:\n", - " - first_name (no m values are trained).\n", - " - surname (no m values are trained).\n" - ] - } - ], - "source": [ - "training_blocking_rule = \"l.first_name = r.first_name and l.surname = r.surname\"\n", - "training_session_names = linker.estimate_parameters_using_expectation_maximisation(\n", - " training_blocking_rule, estimate_without_term_frequencies=True\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T18:41:41.686951Z", - "iopub.status.busy": "2024-05-15T18:41:41.686683Z", - "iopub.status.idle": "2024-05-15T18:41:41.926273Z", - "shell.execute_reply": "2024-05-15T18:41:41.925689Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "----- Starting EM training session -----\n", - "\n" - ] + "cell_type": "code", + "execution_count": 5, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:33.197015Z", + "iopub.status.busy": "2024-05-15T18:41:33.196743Z", + "iopub.status.idle": "2024-05-15T18:41:33.330331Z", + "shell.execute_reply": "2024-05-15T18:41:33.329671Z" + } + }, + "outputs": [], + "source": [ + "import splink.comparison_library as cl\n", + "import splink.comparison_template_library as ctl\n", + "from splink import Linker\n", + "\n", + "settings = {\n", + " \"link_type\": \"dedupe_only\",\n", + " \"blocking_rules_to_generate_predictions\": [\n", + " block_on(\"first_name\", \"surname\"),\n", + " block_on(\"surname\", \"dob\"),\n", + " block_on(\"first_name\", \"dob\"),\n", + " block_on(\"postcode_fake\", \"first_name\"),\n", + "\n", + " ],\n", + " \"comparisons\": [\n", + " ctl.NameComparison(\"first_name\", fuzzy_thresholds=[0.9]).configure(\n", + " term_frequency_adjustments=True\n", + " ),\n", + " ctl.NameComparison(\"surname\", fuzzy_thresholds=[0.9]).configure(\n", + " term_frequency_adjustments=True\n", + " ),\n", + " cl.DamerauLevenshteinAtThresholds(\"dob\", [1, 2]).configure(\n", + " term_frequency_adjustments=True\n", + " ),\n", + " cl.DamerauLevenshteinAtThresholds(\"postcode_fake\", [1, 2]),\n", + " cl.ExactMatch(\"birth_place\").configure(term_frequency_adjustments=True),\n", + " cl.ExactMatch(\n", + " \"occupation\",\n", + " ).configure(term_frequency_adjustments=True),\n", + " ],\n", + " \"retain_matching_columns\": True,\n", + " \"retain_intermediate_calculation_columns\": True,\n", + " \"max_iterations\": 10,\n", + " \"em_convergence\": 0.01,\n", + "}\n", + "\n", + "linker = Linker(df, settings, database_api=db_api)" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - 
"Estimating the m probabilities of the model by blocking on:\n", - "l.dob = r.dob\n", - "\n", - "Parameter estimates will be made for the following comparison(s):\n", - " - first_name\n", - " - surname\n", - " - postcode_fake\n", - " - birth_place\n", - " - occupation\n", - "\n", - "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", - " - dob\n" - ] + "cell_type": "code", + "execution_count": 6, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:33.334300Z", + "iopub.status.busy": "2024-05-15T18:41:33.333988Z", + "iopub.status.idle": "2024-05-15T18:41:33.488238Z", + "shell.execute_reply": "2024-05-15T18:41:33.487555Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Probability two random records match is estimated to be 0.00013.\n", + "This means that amongst all possible pairwise record comparisons, one in 7,667.18 are expected to match. With 12,497,500 total possible comparisons, we expect a total of around 1,630.00 matching pairs\n" + ] + } + ], + "source": [ + "linker.estimate_probability_two_random_records_match(\n", + " [\n", + " \"l.first_name = r.first_name and l.surname = r.surname and l.dob = r.dob\",\n", + " \"substr(l.first_name,1,2) = substr(r.first_name,1,2) and l.surname = r.surname and substr(l.postcode_fake,1,2) = substr(r.postcode_fake,1,2)\",\n", + " \"l.dob = r.dob and l.postcode_fake = r.postcode_fake\",\n", + " ],\n", + " recall=0.6,\n", + ")" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] + "cell_type": "code", + "execution_count": 7, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:33.491551Z", + "iopub.status.busy": "2024-05-15T18:41:33.491328Z", + "iopub.status.idle": "2024-05-15T18:41:41.469753Z", + "shell.execute_reply": "2024-05-15T18:41:41.469157Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "----- Estimating u probabilities using random sampling -----\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Estimated u probabilities using random sampling\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. 
Missing estimates for:\n", + " - first_name (no m values are trained).\n", + " - surname (no m values are trained).\n", + " - dob (no m values are trained).\n", + " - postcode_fake (no m values are trained).\n", + " - birth_place (no m values are trained).\n", + " - occupation (no m values are trained).\n" + ] + } + ], + "source": [ + "linker.estimate_u_using_random_sampling(max_pairs=1e6)" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 1: Largest change in params was -0.315 in the m_probability of first_name, level `Exact match on first_name`\n" - ] + "cell_type": "code", + "execution_count": 8, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:41.473301Z", + "iopub.status.busy": "2024-05-15T18:41:41.473009Z", + "iopub.status.idle": "2024-05-15T18:41:41.683463Z", + "shell.execute_reply": "2024-05-15T18:41:41.682843Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.first_name = r.first_name and l.surname = r.surname\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - dob\n", + " - postcode_fake\n", + " - birth_place\n", + " - occupation\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - first_name\n", + " - surname\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was -0.435 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was -0.0222 in probability_two_random_records_match\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was -0.00727 in the m_probability of birth_place, level `All other comparisons`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 3 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is not yet fully trained. 
Missing estimates for:\n", + " - first_name (no m values are trained).\n", + " - surname (no m values are trained).\n" + ] + } + ], + "source": [ + "training_blocking_rule = \"l.first_name = r.first_name and l.surname = r.surname\"\n", + "training_session_names = linker.estimate_parameters_using_expectation_maximisation(\n", + " training_blocking_rule, estimate_without_term_frequencies=True\n", + ")" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 2: Largest change in params was -0.0462 in the m_probability of first_name, level `Exact match on first_name`\n" - ] + "cell_type": "code", + "execution_count": 9, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:41.686951Z", + "iopub.status.busy": "2024-05-15T18:41:41.686683Z", + "iopub.status.idle": "2024-05-15T18:41:41.926273Z", + "shell.execute_reply": "2024-05-15T18:41:41.925689Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Estimating the m probabilities of the model by blocking on:\n", + "l.dob = r.dob\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - first_name\n", + " - surname\n", + " - postcode_fake\n", + " - birth_place\n", + " - occupation\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - dob\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 1: Largest change in params was -0.315 in the m_probability of first_name, level `Exact match on first_name`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 2: Largest change in params was -0.0462 in the m_probability of first_name, level `Exact match on first_name`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 3: Largest change in params was -0.0104 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 4: Largest change in params was -0.00254 in the m_probability of surname, level `Exact match on surname`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "EM converged after 4 iterations\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Your model is fully trained. 
All comparisons have at least one estimate for their m and u values\n" + ] + } + ], + "source": [ + "training_blocking_rule = \"l.dob = r.dob\"\n", + "training_session_dob = linker.estimate_parameters_using_expectation_maximisation(\n", + " training_blocking_rule, estimate_without_term_frequencies=True\n", + ")" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 3: Largest change in params was -0.0104 in the m_probability of surname, level `Exact match on surname`\n" - ] + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The final match weights can be viewed in the match weights chart:\n" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration 4: Largest change in params was -0.00254 in the m_probability of surname, level `Exact match on surname`\n" - ] + "cell_type": "code", + "execution_count": 10, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:41.929306Z", + "iopub.status.busy": "2024-05-15T18:41:41.929078Z", + "iopub.status.idle": "2024-05-15T18:41:42.230106Z", + "shell.execute_reply": "2024-05-15T18:41:42.229484Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
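For readers skimming the diff, the cells above amount to a three-step training recipe. A condensed sketch, assuming `linker` has already been constructed with input data and settings as in the earlier cells:

```python
# Sketch of the training sequence used in the cells above (not a new API).

# 1. Estimate the prior (lambda) from deterministic rules plus a recall guess
linker.estimate_probability_two_random_records_match(
    ["l.first_name = r.first_name and l.surname = r.surname and l.dob = r.dob"],
    recall=0.6,
)

# 2. Estimate u probabilities by random sampling
linker.estimate_u_using_random_sampling(max_pairs=1e6)

# 3. Estimate m probabilities with two EM sessions. A session cannot train the
#    columns it blocks on, so the two rules are chosen to cover each other:
#    blocking on name trains dob/postcode/..., blocking on dob trains the names.
linker.estimate_parameters_using_expectation_maximisation(
    "l.first_name = r.first_name and l.surname = r.surname",
    estimate_without_term_frequencies=True,
)
linker.estimate_parameters_using_expectation_maximisation(
    "l.dob = r.dob", estimate_without_term_frequencies=True
)
```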
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.match_weights_chart()" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "EM converged after 4 iterations\n" - ] + "cell_type": "code", + "execution_count": 11, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:42.233172Z", + "iopub.status.busy": "2024-05-15T18:41:42.232933Z", + "iopub.status.idle": "2024-05-15T18:41:42.813828Z", + "shell.execute_reply": "2024-05-15T18:41:42.813043Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.unlinkables_chart()" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n" - ] - } - ], - "source": [ - "training_blocking_rule = \"l.dob = r.dob\"\n", - "training_session_dob = linker.estimate_parameters_using_expectation_maximisation(\n", - " training_blocking_rule, estimate_without_term_frequencies=True\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The final match weights can be viewed in the match weights chart:\n" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T18:41:41.929306Z", - "iopub.status.busy": "2024-05-15T18:41:41.929078Z", - "iopub.status.idle": "2024-05-15T18:41:42.230106Z", - "shell.execute_reply": "2024-05-15T18:41:42.229484Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" + "cell_type": "code", + "execution_count": 12, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:42.817975Z", + "iopub.status.busy": "2024-05-15T18:41:42.817397Z", + "iopub.status.idle": "2024-05-15T18:41:43.292311Z", + "shell.execute_reply": "2024-05-15T18:41:43.291620Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
match_weightmatch_probabilityunique_id_lunique_id_rfirst_name_lfirst_name_rgamma_first_nametf_first_name_ltf_first_name_rbf_first_name...bf_birth_placebf_tf_adj_birth_placeoccupation_loccupation_rgamma_occupationtf_occupation_ltf_occupation_rbf_occupationbf_tf_adj_occupationmatch_key
030.7379211.000000Q6139106-4Q6139106-5jamesjames20.0262260.02622643.192839...125.2665812.594855NoneNone-1NaNNaN1.0000001.0000000
1-10.7284430.000589Q5545144-4Q608545-9georgegeorge20.0320320.03203243.192839...0.1732831.000000rugby union playerentomologist00.0134560.0019220.0844151.0000000
235.6066361.000000Q4888351-1Q4888351-2benjaminbenjamin20.0032030.00320343.192839...125.2665814.757234rugby union playerrugby union player10.0134560.01345624.4444132.7931460
335.6066361.000000Q4888351-1Q4888351-4benjaminbenjamin20.0032030.00320343.192839...125.2665814.757234rugby union playerrugby union player10.0134560.01345624.4444132.7931460
436.8668101.000000Q1293322-1Q1293322-4edwardedward20.0184180.01841843.192839...125.2665812.195647priestpriest10.0246060.02460624.4444131.5275020
\n", + "

5 rows × 44 columns

\n", + "
" + ], + "text/plain": [ + " match_weight match_probability unique_id_l unique_id_r first_name_l \\\n", + "0 30.737921 1.000000 Q6139106-4 Q6139106-5 james \n", + "1 -10.728443 0.000589 Q5545144-4 Q608545-9 george \n", + "2 35.606636 1.000000 Q4888351-1 Q4888351-2 benjamin \n", + "3 35.606636 1.000000 Q4888351-1 Q4888351-4 benjamin \n", + "4 36.866810 1.000000 Q1293322-1 Q1293322-4 edward \n", + "\n", + " first_name_r gamma_first_name tf_first_name_l tf_first_name_r \\\n", + "0 james 2 0.026226 0.026226 \n", + "1 george 2 0.032032 0.032032 \n", + "2 benjamin 2 0.003203 0.003203 \n", + "3 benjamin 2 0.003203 0.003203 \n", + "4 edward 2 0.018418 0.018418 \n", + "\n", + " bf_first_name ... bf_birth_place bf_tf_adj_birth_place \\\n", + "0 43.192839 ... 125.266581 2.594855 \n", + "1 43.192839 ... 0.173283 1.000000 \n", + "2 43.192839 ... 125.266581 4.757234 \n", + "3 43.192839 ... 125.266581 4.757234 \n", + "4 43.192839 ... 125.266581 2.195647 \n", + "\n", + " occupation_l occupation_r gamma_occupation tf_occupation_l \\\n", + "0 None None -1 NaN \n", + "1 rugby union player entomologist 0 0.013456 \n", + "2 rugby union player rugby union player 1 0.013456 \n", + "3 rugby union player rugby union player 1 0.013456 \n", + "4 priest priest 1 0.024606 \n", + "\n", + " tf_occupation_r bf_occupation bf_tf_adj_occupation match_key \n", + "0 NaN 1.000000 1.000000 0 \n", + "1 0.001922 0.084415 1.000000 0 \n", + "2 0.013456 24.444413 2.793146 0 \n", + "3 0.013456 24.444413 2.793146 0 \n", + "4 0.024606 24.444413 1.527502 0 \n", + "\n", + "[5 rows x 44 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - "alt.VConcatChart(...)" + "source": [ + "df_predict = linker.predict()\n", + "df_e = df_predict.as_pandas_dataframe(limit=5)\n", + "df_e" ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "linker.match_weights_chart()" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T18:41:42.233172Z", - "iopub.status.busy": "2024-05-15T18:41:42.232933Z", - "iopub.status.idle": "2024-05-15T18:41:42.813828Z", - "shell.execute_reply": "2024-05-15T18:41:42.813043Z" - } - }, - "outputs": [ + }, { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.LayerChart(...)" + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also view rows in this dataset as a waterfall chart as follows:\n" ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "linker.unlinkables_chart()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T18:41:42.817975Z", - "iopub.status.busy": "2024-05-15T18:41:42.817397Z", - "iopub.status.idle": "2024-05-15T18:41:43.292311Z", - "shell.execute_reply": "2024-05-15T18:41:43.291620Z" - } - }, - "outputs": [ + }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
match_weightmatch_probabilityunique_id_lunique_id_rfirst_name_lfirst_name_rgamma_first_nametf_first_name_ltf_first_name_rbf_first_name...bf_birth_placebf_tf_adj_birth_placeoccupation_loccupation_rgamma_occupationtf_occupation_ltf_occupation_rbf_occupationbf_tf_adj_occupationmatch_key
030.7379211.000000Q6139106-4Q6139106-5jamesjames20.0262260.02622643.192839...125.2665812.594855NoneNone-1NaNNaN1.0000001.0000000
1-10.7284430.000589Q5545144-4Q608545-9georgegeorge20.0320320.03203243.192839...0.1732831.000000rugby union playerentomologist00.0134560.0019220.0844151.0000000
235.6066361.000000Q4888351-1Q4888351-2benjaminbenjamin20.0032030.00320343.192839...125.2665814.757234rugby union playerrugby union player10.0134560.01345624.4444132.7931460
335.6066361.000000Q4888351-1Q4888351-4benjaminbenjamin20.0032030.00320343.192839...125.2665814.757234rugby union playerrugby union player10.0134560.01345624.4444132.7931460
436.8668101.000000Q1293322-1Q1293322-4edwardedward20.0184180.01841843.192839...125.2665812.195647priestpriest10.0246060.02460624.4444131.5275020
\n", - "

5 rows × 44 columns

\n", - "
" + "cell_type": "code", + "execution_count": 13, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:43.296030Z", + "iopub.status.busy": "2024-05-15T18:41:43.295753Z", + "iopub.status.idle": "2024-05-15T18:41:43.969119Z", + "shell.execute_reply": "2024-05-15T18:41:43.968521Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - " match_weight match_probability unique_id_l unique_id_r first_name_l \\\n", - "0 30.737921 1.000000 Q6139106-4 Q6139106-5 james \n", - "1 -10.728443 0.000589 Q5545144-4 Q608545-9 george \n", - "2 35.606636 1.000000 Q4888351-1 Q4888351-2 benjamin \n", - "3 35.606636 1.000000 Q4888351-1 Q4888351-4 benjamin \n", - "4 36.866810 1.000000 Q1293322-1 Q1293322-4 edward \n", - "\n", - " first_name_r gamma_first_name tf_first_name_l tf_first_name_r \\\n", - "0 james 2 0.026226 0.026226 \n", - "1 george 2 0.032032 0.032032 \n", - "2 benjamin 2 0.003203 0.003203 \n", - "3 benjamin 2 0.003203 0.003203 \n", - "4 edward 2 0.018418 0.018418 \n", - "\n", - " bf_first_name ... bf_birth_place bf_tf_adj_birth_place \\\n", - "0 43.192839 ... 125.266581 2.594855 \n", - "1 43.192839 ... 0.173283 1.000000 \n", - "2 43.192839 ... 125.266581 4.757234 \n", - "3 43.192839 ... 125.266581 4.757234 \n", - "4 43.192839 ... 125.266581 2.195647 \n", - "\n", - " occupation_l occupation_r gamma_occupation tf_occupation_l \\\n", - "0 None None -1 NaN \n", - "1 rugby union player entomologist 0 0.013456 \n", - "2 rugby union player rugby union player 1 0.013456 \n", - "3 rugby union player rugby union player 1 0.013456 \n", - "4 priest priest 1 0.024606 \n", - "\n", - " tf_occupation_r bf_occupation bf_tf_adj_occupation match_key \n", - "0 NaN 1.000000 1.000000 0 \n", - "1 0.001922 0.084415 1.000000 0 \n", - "2 0.013456 24.444413 2.793146 0 \n", - "3 0.013456 24.444413 2.793146 0 \n", - "4 0.024606 24.444413 1.527502 0 \n", - "\n", - "[5 rows x 44 columns]" + "source": [ + "\n", + "records_to_plot = df_e.to_dict(orient=\"records\")\n", + "linker.waterfall_chart(records_to_plot, filter_nulls=False)" ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_predict = linker.predict()\n", - "df_e = df_predict.as_pandas_dataframe(limit=5)\n", - "df_e" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also view rows in this dataset as a waterfall chart as follows:\n" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T18:41:43.296030Z", - "iopub.status.busy": "2024-05-15T18:41:43.295753Z", - "iopub.status.idle": "2024-05-15T18:41:43.969119Z", - "shell.execute_reply": "2024-05-15T18:41:43.968521Z" - } - }, - "outputs": [ + }, { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" + "cell_type": "code", + "execution_count": 14, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:43.972219Z", + "iopub.status.busy": "2024-05-15T18:41:43.971787Z", + "iopub.status.idle": "2024-05-15T18:41:44.116709Z", + "shell.execute_reply": "2024-05-15T18:41:44.115993Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Completed iteration 1, root rows count 4\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Completed iteration 2, root rows count 0\n" + ] + } ], - "text/plain": [ - "alt.LayerChart(...)" + "source": [ + "clusters = linker.cluster_pairwise_predictions_at_threshold(\n", + " df_predict, threshold_match_probability=0.95\n", + ")" ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "\n", - "records_to_plot = df_e.to_dict(orient=\"records\")\n", - "linker.waterfall_chart(records_to_plot, filter_nulls=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T18:41:43.972219Z", - "iopub.status.busy": "2024-05-15T18:41:43.971787Z", - "iopub.status.idle": "2024-05-15T18:41:44.116709Z", - "shell.execute_reply": "2024-05-15T18:41:44.115993Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Completed iteration 1, root rows count 4\n" - ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Completed iteration 2, root rows count 0\n" - ] - } - ], - "source": [ - "clusters = linker.cluster_pairwise_predictions_at_threshold(\n", - " df_predict, threshold_match_probability=0.95\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T18:41:44.120162Z", - "iopub.status.busy": "2024-05-15T18:41:44.119922Z", - "iopub.status.idle": "2024-05-15T18:41:44.180152Z", - "shell.execute_reply": "2024-05-15T18:41:44.179445Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " " + "cell_type": "code", + "execution_count": 15, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:44.120162Z", + "iopub.status.busy": "2024-05-15T18:41:44.119922Z", + "iopub.status.idle": "2024-05-15T18:41:44.180152Z", + "shell.execute_reply": "2024-05-15T18:41:44.179445Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - "" + "source": [ + "linker.cluster_studio_dashboard(\n", + " df_predict,\n", + " clusters,\n", + " \"dashboards/50k_cluster.html\",\n", + " sampling_method=\"by_cluster_size\",\n", + " overwrite=True,\n", + ")\n", + "\n", + "from IPython.display import IFrame\n", + "\n", + "IFrame(src=\"./dashboards/50k_cluster.html\", width=\"100%\", height=1200)" ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "linker.cluster_studio_dashboard(\n", - " df_predict,\n", - " clusters,\n", - " \"dashboards/50k_cluster.html\",\n", - " sampling_method=\"by_cluster_size\",\n", - " overwrite=True,\n", - ")\n", - "\n", - "from IPython.display import IFrame\n", - "\n", - "IFrame(src=\"./dashboards/50k_cluster.html\", width=\"100%\", height=1200)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "execution": { - "iopub.execute_input": 
"2024-05-15T18:41:44.184020Z", - "iopub.status.busy": "2024-05-15T18:41:44.183710Z", - "iopub.status.idle": "2024-05-15T18:41:46.543532Z", - "shell.execute_reply": "2024-05-15T18:41:46.542614Z" - } - }, - "outputs": [ + }, { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" + "cell_type": "code", + "execution_count": 16, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:44.184020Z", + "iopub.status.busy": "2024-05-15T18:41:44.183710Z", + "iopub.status.idle": "2024-05-15T18:41:46.543532Z", + "shell.execute_reply": "2024-05-15T18:41:46.542614Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - "alt.Chart(...)" + "source": [ + "linker.roc_chart_from_labels_column(\"cluster\", match_weight_round_to_nearest=0.02)" ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "linker.roc_chart_from_labels_column(\"cluster\", match_weight_round_to_nearest=0.02)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T18:41:46.557696Z", - "iopub.status.busy": "2024-05-15T18:41:46.557395Z", - "iopub.status.idle": "2024-05-15T18:41:47.295019Z", - "shell.execute_reply": "2024-05-15T18:41:47.294474Z" - } - }, - "outputs": [ + }, { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" + "cell_type": "code", + "execution_count": 17, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:46.557696Z", + "iopub.status.busy": "2024-05-15T18:41:46.557395Z", + "iopub.status.idle": "2024-05-15T18:41:47.295019Z", + "shell.execute_reply": "2024-05-15T18:41:47.294474Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - "alt.LayerChart(...)" + "source": [ + "records = linker.prediction_errors_from_labels_column(\n", + " \"cluster\",\n", + " threshold=0.999,\n", + " include_false_negatives=False,\n", + " include_false_positives=True,\n", + ").as_record_dict()\n", + "linker.waterfall_chart(records)" ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "records = linker.prediction_errors_from_labels_column(\n", - " \"cluster\",\n", - " threshold=0.999,\n", - " include_false_negatives=False,\n", - " include_false_positives=True,\n", - ").as_record_dict()\n", - "linker.waterfall_chart(records)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-15T18:41:47.298555Z", - "iopub.status.busy": "2024-05-15T18:41:47.298310Z", - "iopub.status.idle": "2024-05-15T18:41:50.039196Z", - "shell.execute_reply": "2024-05-15T18:41:50.038400Z" - } - }, - "outputs": [ + }, { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" + "cell_type": "code", + "execution_count": 18, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-15T18:41:47.298555Z", + "iopub.status.busy": "2024-05-15T18:41:47.298310Z", + "iopub.status.idle": "2024-05-15T18:41:50.039196Z", + "shell.execute_reply": "2024-05-15T18:41:50.038400Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - "alt.LayerChart(...)" + "source": [ + "# Some of the false negatives will be because they weren't detected by the blocking rules\n", + "records = linker.prediction_errors_from_labels_column(\n", + " \"cluster\",\n", + " threshold=0.5,\n", + " include_false_negatives=True,\n", + " include_false_positives=False,\n", + ").as_record_dict(limit=50)\n", + "\n", + "linker.waterfall_chart(records)" ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" } - ], - "source": [ - "# Some of the false negatives will be because they weren't detected by the blocking rules\n", - "records = linker.prediction_errors_from_labels_column(\n", - " \"cluster\",\n", - " threshold=0.5,\n", - " include_false_negatives=True,\n", - " include_false_positives=False,\n", - ").as_record_dict(limit=50)\n", - "\n", - "linker.waterfall_chart(records)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/docs/demos/tutorials/03_Blocking.ipynb b/docs/demos/tutorials/03_Blocking.ipynb index 18b22638bd..20f2ab0243 100644 --- a/docs/demos/tutorials/03_Blocking.ipynb +++ b/docs/demos/tutorials/03_Blocking.ipynb @@ -1,516 +1,516 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Choosing blocking rules to optimise runtime\n", - "\n", - "\n", - " \"Open\n", - "\n", - "\n", - "To link records, we need to compare pairs of records, and decide which pairs are matches and non matches.\n", - "\n", - "For example consider the following two records:\n", - "\n", - "| first_name | surname | dob | city | email |\n", - "| ---------- | ------- | ---------- | ------ | ------------------- |\n", - "| Robert | Allen | 1971-05-24 | nan | roberta25@smith.net |\n", - "| Rob | Allen | 1971-06-24 | London | roberta25@smith.net |\n", - "\n", - "These can be represented as a pairwise comparison as follows:\n", - "\n", - "| first_name_l | first_name_r | surname_l | surname_r | dob_l | dob_r | city_l | city_r | email_l | email_r |\n", - "| ------------ | ------------ | --------- | --------- | ---------- | ---------- | ------ | ------ | ------------------- | ------------------- |\n", - "| Robert | Rob | Allen | Allen | 1971-05-24 | 1971-06-24 | nan | London | roberta25@smith.net | roberta25@smith.net |\n", - "\n", - "For most large datasets, it is computationally intractable to compare every row with every other row, since the number of comparisons rises quadratically with the number of records.\n", - "\n", - "Instead we rely on blocking rules, which specify which pairwise comparisons to 
generate. For example, we could generate the subset of pairwise comparisons where either first name or surname matches.\n", - "\n", - "This is part of a two step process to link data:\n", - "\n", - "1. Use blocking rules to generate candidate pairwise record comparisons\n", - "\n", - "2. Use a probabilistic linkage model to score these candidate pairs, to determine which ones should be linked\n", - "\n", - "**Blocking rules are the most important determinant of the performance of your linkage job**.\n", - "\n", - "When deciding on your blocking rules, you're trading off accuracy for performance:\n", - "\n", - "- If your rules are too loose, your linkage job may fail.\n", - "- If they're too tight, you may miss some valid links.\n", - "\n", - "This tutorial clarifies what blocking rules are, and how to choose good rules.\n", - "\n", - "## Blocking rules in Splink\n", - "\n", - "In Splink, blocking rules are specified as SQL expressions.\n", - "\n", - "For example, to generate the subset of record comparisons where the first name and surname matches, we can specify the following blocking rule:\n", - "\n", - "```python\n", - "from splink.blocking_rule_library import block_on\n", - "block_on(\"first_name\", \"surname\")\n", - "```\n", - "\n", - "When executed, this blocking rule will be converted to a SQL statement with the following form:\n", - "\n", - "```sql\n", - "SELECT ...\n", - "FROM input_tables as l\n", - "INNER jOIN input_tables as r\n", - "ON l.first_name = r.first_name AND l.surname = r.surname\n", - "```\n", - "\n", - "Since blocking rules are SQL expressions, they can be arbitrarily complex. For example, you could create record comparisons where the initial of the first name and the surname match with the following rule:\n", - "\n", - "```python\n", - "from splink.blocking_rule_library import block_on\n", - "block_on(\"substr(first_name, 1, 2)\", \"surname\")\n", - "```\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Devising effective blocking rules for prediction\n", - "\n", - "The aims of your blocking rules are twofold:\n", - "\n", - "1. Eliminate enough non-matching comparison pairs so your record linkage job is small enough to compute\n", - "2. Eliminate as few truly matching pairs as possible (ideally none)\n", - "\n", - "It is usually impossible to find a single blocking rule which achieves both aims, so we recommend using multiple blocking rules.\n", - "\n", - "When we specify multiple blocking rules, Splink will generate all comparison pairs that meet any one of the rules.\n", - "\n", - "For example, consider the following blocking rule:\n", - "\n", - "`block_on(\"first_name\", \"dob\")`\n", - "\n", - "This rule is likely to be effective in reducing the number of comparison pairs. It will retain all truly matching pairs, except those with errors or nulls in either the `first_name` or `dob` fields.\n", - "\n", - "Now consider a second blocking rule:\n", - "\n", - "`block_on(\"email\")`.\n", - "\n", - "This will retain all truly matching pairs, except those with errors or nulls in the `email` column.\n", - "\n", - "Individually, these blocking rules are problematic because they exclude true matches where the records contain typos of certain types. 
But between them, they might do quite a good job.\n", - "\n", - "For a true match to be eliminated by the use of these two blocking rules, it would have to have an error in _both_ `email` AND (`first_name` or `dob`).\n", - "\n", - "This is not completely implausible, but it is significantly less likely than if we'd used a single rule.\n", - "\n", - "More generally, we can often specify multiple blocking rules such that it becomes highly implausible that a true match would not meet at least one of these blocking criteria. This is the recommended approach in Splink. Generally we would recommend between about 3 and 10, though even more is possible.\n", - "\n", - "The question then becomes how to choose what to put in this list.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Splink tools to help choose your blocking rules\n", - "\n", - "Splink contains a number of tools to help you choose effective blocking rules. Let's try them out, using our small test dataset:\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:14:10.776394Z", - "iopub.status.busy": "2024-05-16T12:14:10.776043Z", - "iopub.status.idle": "2024-05-16T12:14:10.781556Z", - "shell.execute_reply": "2024-05-16T12:14:10.780845Z" - } - }, - "outputs": [], - "source": [ - "# Uncomment and run this cell if you're running in Google Colab.\n", - "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:14:10.785735Z", - "iopub.status.busy": "2024-05-16T12:14:10.785460Z", - "iopub.status.idle": "2024-05-16T12:14:12.763325Z", - "shell.execute_reply": "2024-05-16T12:14:12.762406Z" + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Choosing blocking rules to optimise runtime\n", + "\n", + "\n", + " \"Open\n", + "\n", + "\n", + "To link records, we need to compare pairs of records, and decide which pairs are matches and non matches.\n", + "\n", + "For example consider the following two records:\n", + "\n", + "| first_name | surname | dob | city | email |\n", + "| ---------- | ------- | ---------- | ------ | ------------------- |\n", + "| Robert | Allen | 1971-05-24 | nan | roberta25@smith.net |\n", + "| Rob | Allen | 1971-06-24 | London | roberta25@smith.net |\n", + "\n", + "These can be represented as a pairwise comparison as follows:\n", + "\n", + "| first_name_l | first_name_r | surname_l | surname_r | dob_l | dob_r | city_l | city_r | email_l | email_r |\n", + "| ------------ | ------------ | --------- | --------- | ---------- | ---------- | ------ | ------ | ------------------- | ------------------- |\n", + "| Robert | Rob | Allen | Allen | 1971-05-24 | 1971-06-24 | nan | London | roberta25@smith.net | roberta25@smith.net |\n", + "\n", + "For most large datasets, it is computationally intractable to compare every row with every other row, since the number of comparisons rises quadratically with the number of records.\n", + "\n", + "Instead we rely on blocking rules, which specify which pairwise comparisons to generate. For example, we could generate the subset of pairwise comparisons where either first name or surname matches.\n", + "\n", + "This is part of a two step process to link data:\n", + "\n", + "1. Use blocking rules to generate candidate pairwise record comparisons\n", + "\n", + "2. 
Use a probabilistic linkage model to score these candidate pairs, to determine which ones should be linked\n",
+        "\n",
+        "**Blocking rules are the most important determinant of the performance of your linkage job**.\n",
+        "\n",
+        "When deciding on your blocking rules, you're trading off accuracy for performance:\n",
+        "\n",
+        "- If your rules are too loose, your linkage job may fail.\n",
+        "- If they're too tight, you may miss some valid links.\n",
+        "\n",
+        "This tutorial clarifies what blocking rules are, and how to choose good rules.\n",
+        "\n",
+        "## Blocking rules in Splink\n",
+        "\n",
+        "In Splink, blocking rules are specified as SQL expressions.\n",
+        "\n",
+        "For example, to generate the subset of record comparisons where the first name and surname match, we can specify the following blocking rule:\n",
+        "\n",
+        "```python\n",
+        "from splink.blocking_rule_library import block_on\n",
+        "block_on(\"first_name\", \"surname\")\n",
+        "```\n",
+        "\n",
+        "When executed, this blocking rule will be converted to a SQL statement with the following form:\n",
+        "\n",
+        "```sql\n",
+        "SELECT ...\n",
+        "FROM input_tables as l\n",
+        "INNER JOIN input_tables as r\n",
+        "ON l.first_name = r.first_name AND l.surname = r.surname\n",
+        "```\n",
+        "\n",
+        "Since blocking rules are SQL expressions, they can be arbitrarily complex. For example, you could create record comparisons where the initial of the first name and the surname match with the following rule:\n",
+        "\n",
+        "```python\n",
+        "from splink.blocking_rule_library import block_on\n",
+        "block_on(\"substr(first_name, 1, 2)\", \"surname\")\n",
+        "```\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Devising effective blocking rules for prediction\n",
+        "\n",
+        "The aims of your blocking rules are twofold:\n",
+        "\n",
+        "1. Eliminate enough non-matching comparison pairs so your record linkage job is small enough to compute\n",
+        "2. Eliminate as few truly matching pairs as possible (ideally none)\n",
+        "\n",
+        "It is usually impossible to find a single blocking rule which achieves both aims, so we recommend using multiple blocking rules.\n",
+        "\n",
+        "When we specify multiple blocking rules, Splink will generate all comparison pairs that meet any one of the rules.\n",
+        "\n",
+        "For example, consider the following blocking rule:\n",
+        "\n",
+        "`block_on(\"first_name\", \"dob\")`\n",
+        "\n",
+        "This rule is likely to be effective in reducing the number of comparison pairs. 
It will retain all truly matching pairs, except those with errors or nulls in either the `first_name` or `dob` fields.\n", + "\n", + "Now consider a second blocking rule:\n", + "\n", + "`block_on(\"email\")`.\n", + "\n", + "This will retain all truly matching pairs, except those with errors or nulls in the `email` column.\n", + "\n", + "Individually, these blocking rules are problematic because they exclude true matches where the records contain typos of certain types. But between them, they might do quite a good job.\n", + "\n", + "For a true match to be eliminated by the use of these two blocking rules, it would have to have an error in _both_ `email` AND (`first_name` or `dob`).\n", + "\n", + "This is not completely implausible, but it is significantly less likely than if we'd used a single rule.\n", + "\n", + "More generally, we can often specify multiple blocking rules such that it becomes highly implausible that a true match would not meet at least one of these blocking criteria. This is the recommended approach in Splink. Generally we would recommend between about 3 and 10, though even more is possible.\n", + "\n", + "The question then becomes how to choose what to put in this list.\n" + ] }, - "tags": [] - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "---\n", - "{'number_of_comparisons_generated_pre_filter_conditions': 1632, 'number_of_comparisons_to_be_scored_post_filter_conditions': 473, 'filter_conditions_identified': '', 'equi_join_conditions_identified': 'SUBSTR(l.first_name, 1, 1) = SUBSTR(r.first_name, 1, 1) AND l.\"surname\" = r.\"surname\"'}\n", - "---\n", - "{'number_of_comparisons_generated_pre_filter_conditions': 4095, 'number_of_comparisons_to_be_scored_post_filter_conditions': 1638, 'filter_conditions_identified': '', 'equi_join_conditions_identified': 'l.\"surname\" = r.\"surname\"'}\n", - "---\n", - "{'number_of_comparisons_generated_pre_filter_conditions': 2153, 'number_of_comparisons_to_be_scored_post_filter_conditions': 682, 'filter_conditions_identified': '', 'equi_join_conditions_identified': 'l.\"email\" = r.\"email\"'}\n", - "---\n", - "{'number_of_comparisons_generated_pre_filter_conditions': 1304, 'number_of_comparisons_to_be_scored_post_filter_conditions': 315, 'filter_conditions_identified': '', 'equi_join_conditions_identified': 'l.\"city\" = r.\"city\" AND l.\"first_name\" = r.\"first_name\"'}\n" - ] + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Splink tools to help choose your blocking rules\n", + "\n", + "Splink contains a number of tools to help you choose effective blocking rules. 
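To see why the combination is so much safer than either rule alone, a rough independence calculation helps (the error rates here are illustrative assumptions, not measured values):

```python
# Illustrative only: assumed, not measured, error rates.
p_error_email = 0.10        # chance a true match has an error/null in email
p_error_name_or_dob = 0.15  # chance of an error/null in first_name or dob

# Missed by block_on("email") alone:
print(p_error_email)  # 0.1

# Missed by BOTH rules (needs an error in email AND in first_name/dob),
# assuming the two kinds of error occur independently:
print(round(p_error_email * p_error_name_or_dob, 3))  # 0.015
```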
Let's try them out, using our small test dataset:\n" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "---\n", - "{'number_of_comparisons_generated_pre_filter_conditions': 4827, 'number_of_comparisons_to_be_scored_post_filter_conditions': 372, 'filter_conditions_identified': 'LEVENSHTEIN(l.surname, r.surname) < 2', 'equi_join_conditions_identified': 'l.first_name = r.first_name'}\n" - ] - } - ], - "source": [ - "from splink.blocking_analysis import count_comparisons_from_blocking_rule\n", - "\n", - "db_api = DuckDBAPI()\n", - "blocking_rules_for_analysis = [\n", - " block_on(\"substr(first_name, 1,1)\", \"surname\"),\n", - " block_on(\"surname\"),\n", - " block_on(\"email\"),\n", - " block_on(\"city\", \"first_name\"),\n", - " \"l.first_name = r.first_name and levenshtein(l.surname, r.surname) < 2\",\n", - "]\n", - "\n", - "\n", - "for br in blocking_rules_for_analysis:\n", - " counts = count_comparisons_from_blocking_rule(\n", - " table_or_tables=df,\n", - " blocking_rule_creator=br,\n", - " link_type=\"dedupe_only\",\n", - " db_api=db_api,\n", - " )\n", - " print(\"---\")\n", - " print(counts)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The maximum number of comparisons that you can compute will be affected by your choice of SQL backend, and how powerful your computer is.\n", - "\n", - "For linkages in DuckDB on a standard laptop, we suggest using blocking rules that create no more than about 20 million comparisons. For Spark and Athena, try starting with fewer than a a billion comparisons, before scaling up.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Counting the number of comparisons created by a list of blocking rules\n", - "\n", - "As noted above, it's usually a good idea to use multiple blocking rules. 
It's therefore useful to know how many record comparisons will be generated when these rules are applied.\n", - "\n", - "Since the same record comparison may be created by several blocking rules, and Splink automatically deduplicates these comparisons, we cannot simply total the number of comparisons generated by each rule individually.\n", - "\n", - "Splink provides a chart that shows the marginal (additional) comparisons generated by each blocking rule, after deduplication:\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:14:13.150066Z", - "iopub.status.busy": "2024-05-16T12:14:13.149747Z", - "iopub.status.idle": "2024-05-16T12:14:13.396698Z", - "shell.execute_reply": "2024-05-16T12:14:13.395979Z" + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-16T12:14:10.776394Z", + "iopub.status.busy": "2024-05-16T12:14:10.776043Z", + "iopub.status.idle": "2024-05-16T12:14:10.781556Z", + "shell.execute_reply": "2024-05-16T12:14:10.780845Z" + } + }, + "outputs": [], + "source": [ + "# Uncomment and run this cell if you're running in Google Colab.\n", + "# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-16T12:14:10.785735Z", + "iopub.status.busy": "2024-05-16T12:14:10.785460Z", + "iopub.status.idle": "2024-05-16T12:14:12.763325Z", + "shell.execute_reply": "2024-05-16T12:14:12.762406Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from splink import Linker, DuckDBAPI, block_on, SettingsCreator, splink_datasets\n", + "import altair as alt\n", + "\n", + "df = splink_datasets.fake_1000" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Counting the number of comparisons created by a single blocking rule\n", + "\n", + "On large datasets, some blocking rules imply the creation of trillions of record comparisons, which would cause a linkage job to fail.\n", + "\n", + "Before using a blocking rule in a linkage job, it's therefore a good idea to count the number of records it generates to ensure it is not too loose:\n" + ] }, - "tags": [] - }, - "outputs": [ { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" + "cell_type": "code", + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-16T12:14:12.767657Z", + "iopub.status.busy": "2024-05-16T12:14:12.767348Z", + "iopub.status.idle": "2024-05-16T12:14:13.144051Z", + "shell.execute_reply": "2024-05-16T12:14:13.143363Z" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---\n", + "{'number_of_comparisons_generated_pre_filter_conditions': 1632, 'number_of_comparisons_to_be_scored_post_filter_conditions': 473, 'filter_conditions_identified': '', 'equi_join_conditions_identified': 'SUBSTR(l.first_name, 1, 1) = SUBSTR(r.first_name, 1, 1) AND l.\"surname\" = r.\"surname\"'}\n", + "---\n", + "{'number_of_comparisons_generated_pre_filter_conditions': 4095, 'number_of_comparisons_to_be_scored_post_filter_conditions': 1638, 'filter_conditions_identified': '', 'equi_join_conditions_identified': 'l.\"surname\" = r.\"surname\"'}\n", + "---\n", + "{'number_of_comparisons_generated_pre_filter_conditions': 2153, 'number_of_comparisons_to_be_scored_post_filter_conditions': 682, 'filter_conditions_identified': '', 'equi_join_conditions_identified': 'l.\"email\" = r.\"email\"'}\n", + "---\n", + "{'number_of_comparisons_generated_pre_filter_conditions': 1304, 'number_of_comparisons_to_be_scored_post_filter_conditions': 315, 'filter_conditions_identified': '', 'equi_join_conditions_identified': 'l.\"city\" = r.\"city\" AND l.\"first_name\" = r.\"first_name\"'}\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---\n", + "{'number_of_comparisons_generated_pre_filter_conditions': 4827, 'number_of_comparisons_to_be_scored_post_filter_conditions': 372, 'filter_conditions_identified': 'LEVENSHTEIN(l.surname, r.surname) < 2', 'equi_join_conditions_identified': 'l.first_name = r.first_name'}\n" + ] + } ], - "text/plain": [ - "alt.Chart(...)" + "source": [ + "from splink.blocking_analysis import count_comparisons_from_blocking_rule\n", + "\n", + "db_api = DuckDBAPI()\n", + "blocking_rules_for_analysis = [\n", + " block_on(\"substr(first_name, 1,1)\", \"surname\"),\n", + " block_on(\"surname\"),\n", + " block_on(\"email\"),\n", + " block_on(\"city\", \"first_name\"),\n", + " \"l.first_name = r.first_name and levenshtein(l.surname, r.surname) < 2\",\n", + "]\n", + "\n", + "\n", + "for br in blocking_rules_for_analysis:\n", + " counts = count_comparisons_from_blocking_rule(\n", + " table_or_tables=df,\n", + " blocking_rule_creator=br,\n", + " link_type=\"dedupe_only\",\n", + " db_api=db_api,\n", + " )\n", + " print(\"---\")\n", + " print(counts)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The maximum number of comparisons that you can compute will be affected by your choice of SQL backend, and how powerful your computer is.\n", + "\n", + "For linkages in DuckDB on a standard laptop, we suggest using blocking rules that create no more than about 20 million comparisons. For Spark and Athena, try starting with fewer than a a billion comparisons, before scaling up.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Counting the number of comparisons created by a list of blocking rules\n", + "\n", + "As noted above, it's usually a good idea to use multiple blocking rules. 
It's therefore useful to know how many record comparisons will be generated when these rules are applied.\n", + "\n", + "Since the same record comparison may be created by several blocking rules, and Splink automatically deduplicates these comparisons, we cannot simply total the number of comparisons generated by each rule individually.\n", + "\n", + "Splink provides a chart that shows the marginal (additional) comparisons generated by each blocking rule, after deduplication:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-16T12:14:13.150066Z", + "iopub.status.busy": "2024-05-16T12:14:13.149747Z", + "iopub.status.idle": "2024-05-16T12:14:13.396698Z", + "shell.execute_reply": "2024-05-16T12:14:13.395979Z" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
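The deduplication the chart relies on can be pictured with sets: each rule contributes only the pairs that no earlier rule has already generated. A sketch with made-up pair ids:

```python
# Sketch: marginal (post-deduplication) comparison counts per blocking rule.
pairs_by_rule = {
    "block_on(first_name)": {(1, 2), (1, 3), (4, 5)},
    "block_on(surname)": {(1, 2), (4, 5), (6, 7)},  # overlaps with rule 1
    "block_on(email)": {(6, 7), (8, 9)},
}

seen: set[tuple[int, int]] = set()
for rule, pairs in pairs_by_rule.items():
    marginal = pairs - seen      # only genuinely new comparisons
    seen |= pairs
    print(rule, len(marginal))   # 3, 1, 1 -> total 5, not 3 + 3 + 2 = 8
```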
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink.blocking_analysis import (\n", + " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n", + ")\n", + "\n", + "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", + " table_or_tables=df,\n", + " blocking_rules=blocking_rules_for_analysis,\n", + " db_api=db_api,\n", + " link_type=\"dedupe_only\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Understanding why certain blocking rules create large numbers of comparisons\n", + "\n", + "Finally, we can use the `profile_columns` function we saw in the previous tutorial to understand a specific blocking rule in more depth.\n", + "\n", + "Suppose we're interested in blocking on city and first initial.\n", + "\n", + "Within each distinct value of `(city, first initial)`, all possible pairwise comparisons will be generated.\n", + "\n", + "So for instance, if there are 15 distinct records with `London,J` then these records will result in `n(n-1)/2 = 105` pairwise comparisons being generated.\n", + "\n", + "In a larger dataset, we might observe 10,000 `London,J` records, which would then be responsible for `49,995,000` comparisons.\n", + "\n", + "These high-frequency values therefore have a disproportionate influence on the overall number of pairwise comparisons, and so it can be useful to analyse skew, as follows:\n" ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from splink.blocking_analysis import (\n", - " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n", - ")\n", - "\n", - "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", - " table_or_tables=df,\n", - " blocking_rule_creators=blocking_rules_for_analysis,\n", - " db_api=db_api,\n", - " link_type=\"dedupe_only\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Understanding why certain blocking rules create large numbers of comparisons\n", - "\n", - "Finally, we can use the `profile_columns` function we saw in the previous tutorial to understand a specific blocking rule in more depth.\n", - "\n", - "Suppose we're interested in blocking on city and first initial.\n", - "\n", - "Within each distinct value of `(city, first initial)`, all possible pairwise comparisons will be generated.\n", - "\n", - "So for instance, if there are 15 distinct records with `London,J` then these records will result in `n(n-1)/2 = 105` pairwise comparisons being generated.\n", - "\n", - "In a larger dataset, we might observe 10,000 `London,J` records, which would then be responsible for `49,995,000` comparisons.\n", - "\n", - "These high-frequency values therefore have a disproportionate influence on the overall number of pairwise comparisons, and so it can be useful to analyse skew, as follows:\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "execution": { - "iopub.execute_input": "2024-05-16T12:14:13.400314Z", - "iopub.status.busy": "2024-05-16T12:14:13.400025Z", - "iopub.status.idle": "2024-05-16T12:14:13.595444Z", - "shell.execute_reply": "2024-05-16T12:14:13.594906Z" }, - "tags": [] - }, - "outputs": [ { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" + "cell_type": "code", + "execution_count": 5, + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-16T12:14:13.400314Z", + "iopub.status.busy": "2024-05-16T12:14:13.400025Z", + "iopub.status.idle": "2024-05-16T12:14:13.595444Z", + "shell.execute_reply": "2024-05-16T12:14:13.594906Z" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - "alt.VConcatChart(...)" + "source": [ + "from splink.exploratory import profile_columns\n", + "profile_columns(df, column_expressions=[\"city || left(first_name,1)\"], db_api=db_api)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "!!! note \"Further Reading\"\n", + ":simple-readme: For a deeper dive on blocking, please refer to the [Blocking Topic Guides](../../topic_guides/blocking/blocking_rules.md).\n", + "\n", + " :material-tools: For more on the blocking tools in Splink, please refer to the [Blocking API documentation](../../linkerbloc.md).\n", + "\n", + " :bar_chart: For more on the charts used in this tutorial, please refer to the [Charts Gallery](../../charts/index.md#blocking).\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next steps\n", + "\n", + "Now we have chosen which records to compare, we can use those records to train a linkage model.\n" ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" } - ], - "source": [ - "from splink.exploratory import profile_columns\n", - "profile_columns(df, column_expressions=[\"city || left(first_name,1)\"], db_api=db_api)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "!!! note \"Further Reading\"\n", - ":simple-readme: For a deeper dive on blocking, please refer to the [Blocking Topic Guides](../../topic_guides/blocking/blocking_rules.md).\n", - "\n", - " :material-tools: For more on the blocking tools in Splink, please refer to the [Blocking API documentation](../../linkerbloc.md).\n", - "\n", - " :bar_chart: For more on the charts used in this tutorial, please refer to the [Charts Gallery](../../charts/index.md#blocking).\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Next steps\n", - "\n", - "Now we have chosen which records to compare, we can use those records to train a linkage model.\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/splink/internals/blocking_analysis.py b/splink/internals/blocking_analysis.py index e9312be7c0..b5cf55cb02 100644 --- a/splink/internals/blocking_analysis.py +++ b/splink/internals/blocking_analysis.py @@ -559,7 +559,7 @@ def add_l_r(sql, table_name): def count_comparisons_from_blocking_rule( *, table_or_tables: Sequence[AcceptableInputTableType], - blocking_rule_creator: Union[BlockingRuleCreator, str, Dict[str, Any]], + blocking_rule: Union[BlockingRuleCreator, str, Dict[str, Any]], link_type: user_input_link_type_options, db_api: DatabaseAPISubClass, 
unique_id_column_name: str = "unique_id",
@@ -567,15 +567,16 @@ def count_comparisons_from_blocking_rule(
     compute_post_filter_count: bool = True,
     max_rows_limit: int = int(1e9),
 ) -> dict[str, Union[int, str]]:
-    blocking_rule_creator_as_creator = to_blocking_rule_creator(
-        blocking_rule_creator
-    ).get_blocking_rule(db_api.sql_dialect.name)
+    # Ensure what's been passed in is a BlockingRuleCreator
+    blocking_rule_creator = to_blocking_rule_creator(blocking_rule).get_blocking_rule(
+        db_api.sql_dialect.name
+    )
 
     splink_df_dict = db_api.register_multiple_tables(table_or_tables)
 
     return _count_comparisons_generated_from_blocking_rule(
         splink_df_dict=splink_df_dict,
-        blocking_rule=blocking_rule_creator_as_creator,
+        blocking_rule=blocking_rule_creator,
         link_type=link_type,
         db_api=db_api,
         compute_post_filter_count=compute_post_filter_count,
@@ -588,7 +589,7 @@ def count_comparisons_from_blocking_rule(
 def cumulative_comparisons_to_be_scored_from_blocking_rules_data(
     *,
     table_or_tables: Sequence[AcceptableInputTableType],
-    blocking_rule_creators: Iterable[Union[BlockingRuleCreator, str, Dict[str, Any]]],
+    blocking_rules: Iterable[Union[BlockingRuleCreator, str, Dict[str, Any]]],
     link_type: user_input_link_type_options,
     db_api: DatabaseAPISubClass,
     unique_id_column_name: str = "unique_id",
@@ -597,20 +598,19 @@ def cumulative_comparisons_to_be_scored_from_blocking_rules_data(
 ) -> pd.DataFrame:
     splink_df_dict = db_api.register_multiple_tables(table_or_tables)
 
-    blocking_rule_creators = ensure_is_iterable(blocking_rule_creators)
+    # whilst they're named blocking_rules, this is actually a list of
+    # BlockingRuleCreators. The following code turns them into BlockingRule objects
+    blocking_rules = ensure_is_iterable(blocking_rules)
 
-    blocking_rules: List[BlockingRule] = []
-    for br in blocking_rule_creators:
-        if isinstance(br, BlockingRule):
-            blocking_rules.append(br)
-        else:
-            blocking_rules.append(
-                to_blocking_rule_creator(br).get_blocking_rule(db_api.sql_dialect.name)
-            )
+    blocking_rules_as_br: List[BlockingRule] = []
+    for br in blocking_rules:
+        blocking_rules_as_br.append(
+            to_blocking_rule_creator(br).get_blocking_rule(db_api.sql_dialect.name)
+        )
 
     return _cumulative_comparisons_to_be_scored_from_blocking_rules(
         splink_df_dict=splink_df_dict,
-        blocking_rules=blocking_rules,
+        blocking_rules=blocking_rules_as_br,
         link_type=link_type,
         db_api=db_api,
         max_rows_limit=max_rows_limit,
@@ -622,7 +622,7 @@ def cumulative_comparisons_to_be_scored_from_blocking_rules_data(
 def cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
     *,
     table_or_tables: Sequence[AcceptableInputTableType],
-    blocking_rule_creators: Iterable[Union[BlockingRuleCreator, str, Dict[str, Any]]],
+    blocking_rules: Iterable[Union[BlockingRuleCreator, str, Dict[str, Any]]],
     link_type: user_input_link_type_options,
     db_api: DatabaseAPISubClass,
     unique_id_column_name: str = "unique_id",
@@ -631,20 +631,19 @@ def cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
 ) -> ChartReturnType:
     splink_df_dict = db_api.register_multiple_tables(table_or_tables)
 
-    blocking_rule_creators = ensure_is_iterable(blocking_rule_creators)
+    # whilst they're named blocking_rules, this is actually a list of
+    # BlockingRuleCreators. 
The following code turns them into BlockingRule objects
+    blocking_rules = ensure_is_iterable(blocking_rules)
 
-    blocking_rules: List[BlockingRule] = []
-    for br in blocking_rule_creators:
-        if isinstance(br, BlockingRule):
-            blocking_rules.append(br)
-        else:
-            blocking_rules.append(
-                to_blocking_rule_creator(br).get_blocking_rule(db_api.sql_dialect.name)
-            )
+    blocking_rules_as_br: List[BlockingRule] = []
+    for br in blocking_rules:
+        blocking_rules_as_br.append(
+            to_blocking_rule_creator(br).get_blocking_rule(db_api.sql_dialect.name)
+        )
 
     pd_df = _cumulative_comparisons_to_be_scored_from_blocking_rules(
         splink_df_dict=splink_df_dict,
-        blocking_rules=blocking_rules,
+        blocking_rules=blocking_rules_as_br,
         link_type=link_type,
         db_api=db_api,
         max_rows_limit=max_rows_limit,
diff --git a/tests/test_analyse_blocking.py b/tests/test_analyse_blocking.py
index 54999e11a9..fb20d57b3e 100644
--- a/tests/test_analyse_blocking.py
+++ b/tests/test_analyse_blocking.py
@@ -49,13 +49,13 @@ def test_analyse_blocking_slow_methodology(test_helpers, dialect):
     }
 
     res_dict = count_comparisons_from_blocking_rule(
-        table_or_tables=df_1, blocking_rule_creator="1=1", **args
+        table_or_tables=df_1, blocking_rule="1=1", **args
     )
     res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"]
     assert res == 4 * 3 / 2
 
     res_dict = count_comparisons_from_blocking_rule(
-        table_or_tables=df_1, blocking_rule_creator=block_on("first_name"), **args
+        table_or_tables=df_1, blocking_rule=block_on("first_name"), **args
     )
 
     res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"]
@@ -63,35 +63,35 @@
     args["link_type"] = "link_only"
     res_dict = count_comparisons_from_blocking_rule(
-        table_or_tables=[df_1, df_2], blocking_rule_creator="1=1", **args
+        table_or_tables=[df_1, df_2], blocking_rule="1=1", **args
     )
     res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"]
     assert res == 4 * 3
 
     res_dict = count_comparisons_from_blocking_rule(
-        table_or_tables=[df_1, df_2], blocking_rule_creator=block_on("surname"), **args
+        table_or_tables=[df_1, df_2], blocking_rule=block_on("surname"), **args
     )
     res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"]
     assert res == 1
 
     res_dict = count_comparisons_from_blocking_rule(
         table_or_tables=[df_1, df_2],
-        blocking_rule_creator=block_on("first_name"),
+        blocking_rule=block_on("first_name"),
         **args,
     )
     res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"]
     assert res == 3
 
     res_dict = count_comparisons_from_blocking_rule(
-        table_or_tables=[df_1, df_2, df_3], blocking_rule_creator="1=1", **args
+        table_or_tables=[df_1, df_2, df_3], blocking_rule="1=1", **args
     )
     res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"]
     assert res == 4 * 3 + 4 * 2 + 2 * 3
 
     args["link_type"] = "link_and_dedupe"
     res_dict = count_comparisons_from_blocking_rule(
-        table_or_tables=[df_1, df_2], blocking_rule_creator="1=1", **args
+        table_or_tables=[df_1, df_2], blocking_rule="1=1", **args
     )
     res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"]
     expected = 4 * 3 + (4 * 3 / 2) + (3 * 2 / 2)
@@ -99,14 +99,14 @@
     rule = "l.first_name = r.first_name and l.surname = r.surname"
     res_dict = count_comparisons_from_blocking_rule(
-        table_or_tables=[df_1, df_2], blocking_rule_creator=rule, **args
+        table_or_tables=[df_1, df_2], blocking_rule=rule, **args
     )
     res = 
res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"] assert res == 1 rule = block_on("first_name", "surname") res_dict = count_comparisons_from_blocking_rule( - table_or_tables=[df_1, df_2], blocking_rule_creator=rule, **args + table_or_tables=[df_1, df_2], blocking_rule=rule, **args ) res = res_dict["number_of_comparisons_to_be_scored_post_filter_conditions"] assert res == 1 @@ -155,7 +155,7 @@ def test_source_dataset_works_as_expected(test_helpers, dialect): r1 = cumulative_comparisons_to_be_scored_from_blocking_rules_data( table_or_tables=df_concat, - blocking_rule_creators=[block_on("first_name")], + blocking_rules=[block_on("first_name")], db_api=db_api, unique_id_column_name="unique_id", source_dataset_column_name="src_dataset", @@ -164,7 +164,7 @@ def test_source_dataset_works_as_expected(test_helpers, dialect): r2 = cumulative_comparisons_to_be_scored_from_blocking_rules_data( table_or_tables=[df_1, df_2], - blocking_rule_creators=[block_on("first_name")], + blocking_rules=[block_on("first_name")], db_api=db_api, unique_id_column_name="unique_id", link_type="link_only", @@ -189,7 +189,7 @@ def test_source_dataset_works_as_expected(test_helpers, dialect): count_comparisons_from_blocking_rule( table_or_tables=df_concat_3, - blocking_rule_creator=block_on("first_name"), + blocking_rule=block_on("first_name"), link_type="dedupe_only", unique_id_column_name="unique_id", db_api=db_api, @@ -197,7 +197,7 @@ def test_source_dataset_works_as_expected(test_helpers, dialect): r1 = count_comparisons_from_blocking_rule( table_or_tables=df_concat_3, - blocking_rule_creator=block_on("first_name"), + blocking_rule=block_on("first_name"), link_type="link_only", db_api=db_api, unique_id_column_name="unique_id", @@ -206,7 +206,7 @@ def test_source_dataset_works_as_expected(test_helpers, dialect): r2 = count_comparisons_from_blocking_rule( table_or_tables=[df_1_no_sds, df_2_no_sds, df_3_no_sds], - blocking_rule_creator=block_on("first_name"), + blocking_rule=block_on("first_name"), link_type="link_only", db_api=db_api, unique_id_column_name="unique_id", @@ -222,7 +222,7 @@ def test_source_dataset_works_as_expected(test_helpers, dialect): r1 = count_comparisons_from_blocking_rule( table_or_tables=df_concat_2, - blocking_rule_creator=block_on("first_name"), + blocking_rule=block_on("first_name"), link_type="link_only", db_api=db_api, unique_id_column_name="unique_id", @@ -231,7 +231,7 @@ def test_source_dataset_works_as_expected(test_helpers, dialect): r2 = count_comparisons_from_blocking_rule( table_or_tables=[df_1_no_sds, df_2_no_sds], - blocking_rule_creator=block_on("first_name"), + blocking_rule=block_on("first_name"), link_type="link_only", db_api=db_api, unique_id_column_name="unique_id", @@ -269,7 +269,7 @@ def test_blocking_records_accuracy(test_helpers, dialect): comparison_count_args = { "table_or_tables": df, - "blocking_rule_creators": [block_on("first_name")], + "blocking_rules": [block_on("first_name")], "link_type": "dedupe_only", "db_api": db_api, "unique_id_column_name": "unique_id", @@ -292,7 +292,7 @@ def test_blocking_records_accuracy(test_helpers, dialect): "l.first_name = r.first_name", ] - comparison_count_args["blocking_rule_creators"] = blocking_rules + comparison_count_args["blocking_rules"] = blocking_rules validate_blocking_output( comparison_count_args, @@ -309,7 +309,7 @@ def test_blocking_records_accuracy(test_helpers, dialect): "l.dob = r.dob", ] - comparison_count_args["blocking_rule_creators"] = blocking_rules + comparison_count_args["blocking_rules"] = 
blocking_rules validate_blocking_output( comparison_count_args, @@ -350,7 +350,7 @@ def test_blocking_records_accuracy(test_helpers, dialect): "link_type": "link_and_dedupe", "db_api": db_api, "unique_id_column_name": "unique_id", - "blocking_rule_creators": blocking_rules, + "blocking_rules": blocking_rules, "source_dataset_column_name": "source_dataset", } @@ -373,7 +373,7 @@ def test_blocking_records_accuracy(test_helpers, dialect): ] comparison_count_args["link_type"] = "link_only" - comparison_count_args["blocking_rule_creators"] = blocking_rules + comparison_count_args["blocking_rules"] = blocking_rules validate_blocking_output( comparison_count_args, @@ -410,7 +410,7 @@ def test_blocking_records_accuracy(test_helpers, dialect): "link_type": "link_and_dedupe", "db_api": db_api, "unique_id_column_name": "unique_id", - "blocking_rule_creators": [ + "blocking_rules": [ block_on("surname"), block_on("first_name"), ], @@ -427,7 +427,7 @@ def test_blocking_records_accuracy(test_helpers, dialect): ) comparison_count_args["link_type"] = "link_only" - comparison_count_args["blocking_rule_creators"] = [ + comparison_count_args["blocking_rules"] = [ block_on("surname"), block_on("first_name"), ] @@ -471,7 +471,7 @@ def test_analyse_blocking_fast_methodology(): "compute_post_filter_count": False, } - args["blocking_rule_creator"] = "1=1" + args["blocking_rule"] = "1=1" res_dict = count_comparisons_from_blocking_rule(**args) @@ -479,9 +479,7 @@ def test_analyse_blocking_fast_methodology(): assert res == 5 * 5 - args["blocking_rule_creator"] = ( - "l.first_name = r.first_name OR l.surname = r.surname" - ) + args["blocking_rule"] = "l.first_name = r.first_name OR l.surname = r.surname" res_dict = count_comparisons_from_blocking_rule(**args) res = res_dict["number_of_comparisons_generated_pre_filter_conditions"] assert res == 5 * 5 @@ -491,7 +489,7 @@ def test_analyse_blocking_fast_methodology(): # ) # assert res == 3 * 3 + 1 * 1 + 1 * 1 - args["blocking_rule_creator"] = """l.first_name = r.first_name + args["blocking_rule"] = """l.first_name = r.first_name AND levenshtein(l.surname, r.surname) <2""" res_dict = count_comparisons_from_blocking_rule(**args) res = res_dict["number_of_comparisons_generated_pre_filter_conditions"] @@ -499,7 +497,7 @@ def test_analyse_blocking_fast_methodology(): args["table_or_tables"] = [df_1, df_2] args["link_type"] = "link_and_dedupe" - args["blocking_rule_creator"] = block_on("first_name") + args["blocking_rule"] = block_on("first_name") res_dict = count_comparisons_from_blocking_rule(**args) res = res_dict["number_of_comparisons_generated_pre_filter_conditions"] @@ -507,7 +505,7 @@ def test_analyse_blocking_fast_methodology(): assert res == 6 * 6 + 1 * 1 + 1 * 1 args["link_type"] = "link_only" - args["blocking_rule_creator"] = block_on("first_name") + args["blocking_rule"] = block_on("first_name") res_dict = count_comparisons_from_blocking_rule(**args) res = res_dict["number_of_comparisons_generated_pre_filter_conditions"] @@ -548,7 +546,7 @@ def test_analyse_blocking_fast_methodology_edge_cases(): for br in blocking_rules: res_dict = count_comparisons_from_blocking_rule( table_or_tables=df, - blocking_rule_creator=br, + blocking_rule=br, link_type="dedupe_only", db_api=db_api, unique_id_column_name="unique_id", @@ -585,7 +583,7 @@ def test_analyse_blocking_fast_methodology_edge_cases(): for br in blocking_rules: res_dict = count_comparisons_from_blocking_rule( table_or_tables=[df_l, df_r], - blocking_rule_creator=br, + blocking_rule=br, link_type="link_only", 
db_api=db_api, unique_id_column_name="unique_id", @@ -621,7 +619,7 @@ def test_chart(test_helpers, dialect): cumulative_comparisons_to_be_scored_from_blocking_rules_chart( table_or_tables=df, - blocking_rule_creators=[block_on("first_name"), "l.surname = r.surname"], + blocking_rules=[block_on("first_name"), "l.surname = r.surname"], link_type="dedupe_only", db_api=db_api, unique_id_column_name="unique_id", diff --git a/tests/test_full_example_deterministic_link.py b/tests/test_full_example_deterministic_link.py index b817a7b559..359318cf37 100644 --- a/tests/test_full_example_deterministic_link.py +++ b/tests/test_full_example_deterministic_link.py @@ -32,7 +32,7 @@ def test_deterministic_link_full_example(dialect, tmp_path, test_helpers): cumulative_comparisons_to_be_scored_from_blocking_rules_chart( table_or_tables=df, - blocking_rule_creators=br_for_predict, + blocking_rules=br_for_predict, link_type="dedupe_only", db_api=db_api, unique_id_column_name="unique_id", diff --git a/tests/test_full_example_duckdb.py b/tests/test_full_example_duckdb.py index d40e7e5f45..0cc3dfb465 100644 --- a/tests/test_full_example_duckdb.py +++ b/tests/test_full_example_duckdb.py @@ -44,7 +44,7 @@ def test_full_example_duckdb(tmp_path): count_comparisons_from_blocking_rule( table_or_tables=df, - blocking_rule_creator='l.first_name = r.first_name and l."SUR name" = r."SUR name"', # noqa: E501 + blocking_rule='l.first_name = r.first_name and l."SUR name" = r."SUR name"', # noqa: E501 link_type="dedupe_only", db_api=db_api, unique_id_column_name="unique_id", diff --git a/tests/test_full_example_postgres.py b/tests/test_full_example_postgres.py index 86f7cfc75a..e0ef50a462 100644 --- a/tests/test_full_example_postgres.py +++ b/tests/test_full_example_postgres.py @@ -29,7 +29,7 @@ def test_full_example_postgres(tmp_path, pg_engine): count_comparisons_from_blocking_rule( table_or_tables=df, - blocking_rule_creator='l.first_name = r.first_name and l."surname" = r."surname"', # noqa: E501 + blocking_rule='l.first_name = r.first_name and l."surname" = r."surname"', # noqa: E501 link_type="dedupe_only", db_api=db_api, unique_id_column_name="unique_id", @@ -37,7 +37,7 @@ def test_full_example_postgres(tmp_path, pg_engine): cumulative_comparisons_to_be_scored_from_blocking_rules_chart( table_or_tables=df, - blocking_rule_creators=[ + blocking_rules=[ "l.first_name = r.first_name", "l.surname = r.surname", "l.city = r.city", diff --git a/tests/test_new_db_api.py b/tests/test_new_db_api.py index 4cb96b7866..5019b44a92 100644 --- a/tests/test_new_db_api.py +++ b/tests/test_new_db_api.py @@ -118,7 +118,7 @@ def test_charts(dialect, test_helpers, tmp_path): cumulative_comparisons_to_be_scored_from_blocking_rules_chart( table_or_tables=df, - blocking_rule_creators=[block_on("dob"), block_on("first_name")], + blocking_rules=[block_on("dob"), block_on("first_name")], link_type="dedupe_only", db_api=db_api, unique_id_column_name="unique_id", diff --git a/tests/test_total_comparison_count.py b/tests/test_total_comparison_count.py index 9513e5edcb..9f0ecab1d0 100644 --- a/tests/test_total_comparison_count.py +++ b/tests/test_total_comparison_count.py @@ -87,7 +87,7 @@ def make_dummy_frame(row_count): res_dict = count_comparisons_from_blocking_rule( table_or_tables=dfs, - blocking_rule_creator="1=1", + blocking_rule="1=1", link_type=link_type, db_api=db_api, unique_id_column_name="unique_id", From a6b31767e270537b1056782c536ac69f8dfef7c4 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Thu, 16 May 2024 14:40:55 +0100 Subject: 
[PATCH 59/59] rename api --- docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb | 2 +- docs/demos/tutorials/03_Blocking.ipynb | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb b/docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb index 0e800cf16c..f250b921f0 100644 --- a/docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb +++ b/docs/demos/examples/duckdb/deduplicate_50k_synthetic.ipynb @@ -397,7 +397,7 @@ "\n", "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", " table_or_tables=df,\n", - " blocking_rule_creators=blocking_rules,\n", + " blocking_rules=blocking_rules,\n", " db_api=db_api,\n", " link_type=\"dedupe_only\",\n", ")" diff --git a/docs/demos/tutorials/03_Blocking.ipynb b/docs/demos/tutorials/03_Blocking.ipynb index 20f2ab0243..5526d2bb52 100644 --- a/docs/demos/tutorials/03_Blocking.ipynb +++ b/docs/demos/tutorials/03_Blocking.ipynb @@ -219,7 +219,7 @@ "for br in blocking_rules_for_analysis:\n", " counts = count_comparisons_from_blocking_rule(\n", " table_or_tables=df,\n", - " blocking_rule_creator=br,\n", + " blocking_rule=br,\n", " link_type=\"dedupe_only\",\n", " db_api=db_api,\n", " )\n", @@ -513,4 +513,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +}
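
Taken together, the renames in this series mean the blocking-analysis functions are now called with `blocking_rule` / `blocking_rules` rather than `blocking_rule_creator` / `blocking_rule_creators`, even though the values passed can still be raw SQL strings, dicts, or BlockingRuleCreator objects such as block_on. A minimal usage sketch of the renamed API follows. It assumes the public import paths and the bundled `fake_1000` demo dataset exposed by the released 4.x package; neither is defined by this patch series, so treat both as assumptions.

from splink import DuckDBAPI, block_on, splink_datasets
from splink.blocking_analysis import (
    count_comparisons_from_blocking_rule,
    cumulative_comparisons_to_be_scored_from_blocking_rules_chart,
)

db_api = DuckDBAPI()
df = splink_datasets.fake_1000  # assumed demo dataset, not part of this patch

# Renamed keyword: `blocking_rule` (formerly `blocking_rule_creator`).
# A raw SQL string, a dict, or a BlockingRuleCreator such as block_on all work.
counts = count_comparisons_from_blocking_rule(
    table_or_tables=df,
    blocking_rule=block_on("first_name", "surname"),
    link_type="dedupe_only",
    db_api=db_api,
)
print(counts["number_of_comparisons_to_be_scored_post_filter_conditions"])

# Renamed keyword: `blocking_rules` (formerly `blocking_rule_creators`).
# Rules are applied cumulatively, in the order given, to build the chart.
cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
    table_or_tables=df,
    blocking_rules=[block_on("first_name"), "l.surname = r.surname"],
    link_type="dedupe_only",
    db_api=db_api,
)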