diff --git a/scripts/reduce_notebook_runtime.py b/scripts/reduce_notebook_runtime.py
index 7bdb4b4dc2..dff19e7c05 100644
--- a/scripts/reduce_notebook_runtime.py
+++ b/scripts/reduce_notebook_runtime.py
@@ -19,6 +19,13 @@ def modify_notebook(file_path):
         data["cells"] = data["cells"][:19]
         changed = True
 
+    if "sqlite" in file_path:
+        max_pairs = 3e5
+        head_num = 800
+    else:
+        max_pairs = 1e5
+        head_num = 400
+
     for cell in data["cells"]:
         if cell["cell_type"] == "code":
             source = cell["source"]
@@ -26,13 +33,17 @@ def modify_notebook(file_path):
             for line in source:
                 if "splink_datasets" in line and "=" in line:
                     parts = line.split("=")
-                    parts[1] = parts[1].strip() + ".head(400)"
+                    parts[1] = parts[1].strip() + f".head({head_num})"
                     new_line = " = ".join(parts) + "\n"
                     new_source.append(new_line)
                     changed = True
                 elif "estimate_u_using_random_sampling(" in line:
                     new_line = (
-                        re.sub(r"max_pairs=\d+(\.\d+)?[eE]\d+", "max_pairs=1e5", line)
+                        re.sub(
+                            r"max_pairs=\d+(\.\d+)?[eE]\d+",
+                            f"max_pairs={max_pairs}",
+                            line,
+                        )
                         + "\n"
                     )
                     new_source.append(new_line)
diff --git a/splink/internals/blocking_analysis.py b/splink/internals/blocking_analysis.py
index 9807fb4b51..a1e47a0ae2 100644
--- a/splink/internals/blocking_analysis.py
+++ b/splink/internals/blocking_analysis.py
@@ -545,7 +545,7 @@ def count_comparisons_from_blocking_rule(
     blocking_rule_creator: Union[BlockingRuleCreator, str, Dict[str, Any]],
     link_type: user_input_link_type_options,
     db_api: DatabaseAPISubClass,
-    unique_id_column_name: str,
+    unique_id_column_name: str = "unique_id",
     source_dataset_column_name: Optional[str] = None,
     compute_post_filter_count: bool = True,
     max_rows_limit: int = int(1e9),
@@ -574,7 +574,7 @@ def cumulative_comparisons_to_be_scored_from_blocking_rules_data(
     blocking_rule_creators: Iterable[Union[BlockingRuleCreator, str, Dict[str, Any]]],
     link_type: user_input_link_type_options,
     db_api: DatabaseAPISubClass,
-    unique_id_column_name: str,
+    unique_id_column_name: str = "unique_id",
     max_rows_limit: int = int(1e9),
     source_dataset_column_name: Optional[str] = None,
 ) -> pd.DataFrame:
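
For context, the practical effect of the `blocking_analysis.py` change is that callers whose input data already uses a `unique_id` column no longer need to pass `unique_id_column_name` explicitly. Below is a minimal usage sketch, not taken from this PR: it assumes the function is re-exported via `splink.blocking_analysis`, that `DuckDBAPI` and `block_on` are importable from `splink`, and that the leading `table_or_tables` parameter (not visible in these hunks) accepts a pandas DataFrame. Keyword names other than those shown in the diff (`blocking_rule_creator`, `link_type`, `db_api`) are assumptions.

```python
import pandas as pd

from splink import DuckDBAPI, block_on
from splink.blocking_analysis import count_comparisons_from_blocking_rule

df = pd.DataFrame(
    {
        "unique_id": [1, 2, 3],  # matches the new default column name
        "first_name": ["ann", "ann", "bob"],
        "surname": ["smith", "smith", "jones"],
    }
)

# unique_id_column_name is omitted here: with this PR it defaults to "unique_id"
counts = count_comparisons_from_blocking_rule(
    table_or_tables=df,  # parameter name assumed; not shown in the diff
    blocking_rule_creator=block_on("first_name"),
    link_type="dedupe_only",
    db_api=DuckDBAPI(),
)
print(counts)
```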