Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix issue 1651 - comparison viewer bars sorted improperly #1652

Merged
merged 5 commits into from
Oct 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 14 additions & 13 deletions splink/files/splink_vis_utils/splink_vis_utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -10288,7 +10288,7 @@ ${splink_vis_utils.comparison_column_table(selected_edge, ss)}`;
{
encoding: {
color: {
field: "match_probability",
field: "avg_match_probability",
scale: {
domain: [
0,
Expand Down Expand Up @@ -10317,14 +10317,16 @@ ${splink_vis_utils.comparison_column_table(selected_edge, ss)}`;
type: "quantitative"
},
{
field: "match_probability",
field: "avg_match_probability",
type: "quantitative",
format: ",.1%"
format: ",.1%",
title: "Match probability"
},
{
field: "match_weight",
type: "quantitative",
format: ",.2f"
format: ",.2f",
title: "Match weight"
},
{
field: "sum_matches",
Expand All @@ -10335,17 +10337,12 @@ ${splink_vis_utils.comparison_column_table(selected_edge, ss)}`;
field: "proportion_of_comparisons",
type: "quantitative",
format: ",.1%"
},
{
field: "cumulative_comparisons",
type: "quantitative",
format: ",.1%"
}
],
x: {
field: "gam_concat",
sort: {
field: "match_weight",
field: "sort_avg_match_weight",
op: "sum",
order: "ascending"
},
Expand Down Expand Up @@ -10544,7 +10541,7 @@ ${splink_vis_utils.comparison_column_table(selected_edge, ss)}`;
{
encoding: {
color: {
field: "match_probability",
field: "avg_match_probability",
legend: null,
scale: {
domain: [
Expand All @@ -10563,7 +10560,7 @@ ${splink_vis_utils.comparison_column_table(selected_edge, ss)}`;
x: {
field: "gam_concat",
sort: {
field: "match_weight",
field: "sort_avg_match_weight",
op: "sum",
order: "ascending"
},
Expand Down Expand Up @@ -10636,10 +10633,13 @@ ${splink_vis_utils.comparison_column_table(selected_edge, ss)}`;
let sort_field;
data.forEach((d) => {
d.sum_matches = d.match_probability * d.count;

const bf = Math.pow(2, d.sort_avg_match_weight);
d.avg_match_probability = bf / (1 + bf);
});
if (sort_bars == "sort_match_weight") {
data.sort(sort_match_weight);
sort_field = "match_weight";
sort_field = "sort_avg_match_weight";
}
if (sort_bars == "sort_sum_matches") {

Expand Down Expand Up @@ -10693,6 +10693,7 @@ ${splink_vis_utils.comparison_column_table(selected_edge, ss)}`;
row["bayes_factor"] = d[`bf_${data_col_name}`];
const log2 = Math.log2;
row["match_weight"] = log2(d[`bf_${data_col_name}`]);
row["sort_avg_match_weight"] = d["sort_avg_match_weight"];

row["label_for_charts"] = settings_col.comparison_level_lookup[row["gam_value"]]["label_for_charts"];
row["sql_condition"] = settings_col.comparison_level_lookup[row["gam_value"]]["sql_condition"];
Expand Down
11 changes: 11 additions & 0 deletions splink/splink_comparison_viewer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from jinja2 import Template

from .misc import EverythingEncoder, read_resource
from .predict import _combine_prior_and_bfs

# https://stackoverflow.com/questions/39740632/python-type-hinting-without-cyclic-imports
if TYPE_CHECKING:
Expand All @@ -26,11 +27,21 @@ def row_examples(linker: Linker, example_rows_per_category=2):

gam_concat = " || ',' || ".join(gamma_columns)

# See https://github.com/moj-analytical-services/splink/issues/1651
# This ensures we have an average match weight that isn't affected by
# term frequency (tf) adjustments
bf_columns_no_tf = [c._bf_column_name for c in linker._settings_obj.comparisons]

p = linker._settings_obj._probability_two_random_records_match
bf_final_no_tf = _combine_prior_and_bfs(
p, bf_terms=bf_columns_no_tf, sql_infinity_expr=linker._infinity_expression
)[0]

sql = f"""
select
*,
{uid_expr} as rec_comparison_id,
{gam_concat} as gam_concat,
log2({bf_final_no_tf}) as sort_avg_match_weight,
random() as rand_order
from __splink__df_predict
"""
Expand Down
Loading