Skip to content

Commit

Permalink
unlinkables
Browse files (browse the repository at this point in the history)
  • Loading branch information
RobinL committed Mar 25, 2024
1 parent d1aaef6 commit 99bea33
Showing 1 changed file with 8 additions and 4 deletions.
12 changes: 8 additions & 4 deletions splink/unlinkables.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,6 +2,8 @@

from typing import TYPE_CHECKING

+ from .pipeline import CTEPipeline

# https://stackoverflow.com/questions/39740632/python-type-hinting-without-cyclic-imports
if TYPE_CHECKING:
    from .linker import Linker
Expand All @@ -19,14 +21,16 @@ def unlinkables_data(linker: Linker):

self_link = linker._self_link()

+ pipeline = CTEPipeline(reusable=False)

sql = f"""
select
round(match_weight, 2) as match_weight,
round(match_probability, 5) as match_probability
from {self_link.physical_name}
"""

- linker._enqueue_sql(sql, "__splink__df_round_self_link")
+ pipeline.enqueue_sql(sql, "__splink__df_round_self_link")

sql = """
select
Expand All @@ -38,16 +42,16 @@ def unlinkables_data(linker: Linker):
order by match_probability
"""

- linker._enqueue_sql(sql, "__splink__df_unlinkables_proportions")
+ pipeline.enqueue_sql(sql, "__splink__df_unlinkables_proportions")

sql = """
select *,
sum(prop) over(order by match_probability) as cum_prop
from __splink__df_unlinkables_proportions
where match_probability < 1
"""
- linker._enqueue_sql(sql, "__splink__df_unlinkables_proportions_cumulative")
- data = linker._execute_sql_pipeline(use_cache=False)
+ pipeline.enqueue_sql(sql, "__splink__df_unlinkables_proportions_cumulative")
+ data = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline, use_cache=False)

unlinkables_dict = data.as_record_dict()
data.drop_table_from_database_and_remove_from_cache()
Expand Down

0 comments on commit 99bea33

Please sign in to comment.