Skip to content

Commit

Permalink
Define estimator failure & write script
Browse files Browse the repository at this point in the history
  • Loading branch information
kylebd99 committed Dec 22, 2023
1 parent 999c880 commit 03ec42e
Show file tree
Hide file tree
Showing 9 changed files with 216 additions and 80 deletions.
41 changes: 17 additions & 24 deletions Experiments/Experiments.jl
Original file line number Diff line number Diff line change
@@ -1,35 +1,28 @@
# Overall Experiments Harness Include File
using Serialization: serialize, deserialize
using BenchmarkTools
using Plots
using Plots.PlotMeasures
using StatsPlots
using CSV, DataFrames
using Parquet2: Dataset
using DelimitedFiles: writedlm
using BenchmarkTools
using Random
using Printf
using SharedArrays
using WeakRefStrings
using Random
using StatsPlots
using Distributed
@everywhere using CSV
@everywhere using DataFrames
@everywhere using DelimitedFiles: writedlm
@everywhere using Parquet2: Dataset
@everywhere using Random
@everywhere using Serialization: serialize, deserialize
@everywhere using SharedArrays
@everywhere using WeakRefStrings


include("../Source/CardinalityWithColors.jl")
include("utils.jl")
include("load_datasets.jl")
include("load_querysets.jl")
include("build_color_summaries.jl")
include("get_true_cardinalities.jl")
include("run_estimators.jl")
include("graph_results.jl")
@everywhere include("../Source/CardinalityWithColors.jl")
@everywhere include("utils.jl")
@everywhere include("load_datasets.jl")
include("load_querysets.jl")
@everywhere include("build_color_summaries.jl")
include("get_true_cardinalities.jl")
@everywhere include("run_estimators.jl")
@everywhere using SharedArrays
@everywhere using WeakRefStrings
@everywhere using DelimitedFiles: writedlm
@everywhere using Parquet2: Dataset
@everywhere using Random
@everywhere using CSV, DataFrames
@everywhere using Serialization: serialize, deserialize
include("graph_results.jl")

const TIMEOUT_SEC::Float64 = 60.0
11 changes: 8 additions & 3 deletions Experiments/Scripts/comparison_exps.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ include("../Experiments.jl")

#datasets = [human, aids, lubm80, yeast, hprd, dblp, youtube, eu2005, patents, wordnet]
datasets = [human, aids, lubm80, yeast, dblp, youtube, eu2005, patents]
#datasets = [human, aids, lubm80]

experiment_params = Vector{ExperimentParams}()
for dataset in datasets
Expand All @@ -15,7 +16,7 @@ for dataset in datasets
dataset=dataset,
partitioning_scheme=[(QuasiStable, 32), (NeighborNodeLabels, 32),(QuasiStable, 32), (NeighborNodeLabels, 32)],
description = "AvgQ64N64"))

#=
push!(experiment_params, ExperimentParams(deg_stats_type=MinDegStats,
dataset=dataset,
partitioning_scheme=[(QuasiStable, 64)],
Expand All @@ -31,25 +32,28 @@ for dataset in datasets
dataset=dataset,
partitioning_scheme=[(Hash, 64)],
max_cycle_size = -1,
inference_max_paths = 10^30,
use_partial_sums = false,
description = "BSK"))
push!(experiment_params, ExperimentParams(deg_stats_type=AvgDegStats,
dataset=dataset,
partitioning_scheme=[(QuasiStable, 1)],
max_cycle_size = -1,
description = "IndEst"))
description = "IndEst")) =#
end

#build_experiments(experiment_params)

#run_estimation_experiments(experiment_params)
run_estimation_experiments(experiment_params; timeout=1.0)

order = [string(data) for data in datasets]

graph_grouped_boxplot_with_comparison_methods(experiment_params;
ylims=[10^-5, 10^4],
y_ticks=[10^-5, 10^-4, 10^-3, 10^-2, 10^-1, 10^0, 10^1, 10^2, 10^3, 10^4],
y_type = runtime,
x_type = dataset,
x_order = order,
grouping=description,
dimensions = (1450, 550),
Expand All @@ -61,6 +65,7 @@ graph_grouped_boxplot_with_comparison_methods(experiment_params;
ylims=[10^-21, 10^21],
y_ticks=[10^-20, 10^-15, 10^-10, 10^-5, 10^-2, 10^0, 10^2, 10^5, 10^10, 10^15, 10^20],
y_type = estimate_error,
x_type = dataset,
x_order = order,
grouping=description,
dimensions = (1450, 550),
Expand Down
94 changes: 94 additions & 0 deletions Experiments/Scripts/estimator-failure.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
include("../Experiments.jl")

#datasets = [human, aids]
datasets = [human, aids, lubm80, yeast, dblp, youtube, eu2005, patents]
queries = load_querysets(datasets)
num_queries = Dict(string(dataset)=>length(queries[dataset]) for dataset in datasets)

methods, comparison_results = comparison_dataset()

# Tally, per comparison method and dataset, how many queries the method "failed" on.
# A failure is: no recorded result for the query, or an estimate of 0, ±Inf, or NaN.
failure_counts = Dict()
failure_probabilities = Dict()
for method in methods
    failure_counts[method] = counter(String)
    failure_probabilities[method] = Dict()
    for dataset in datasets
        string_dataset = string(dataset)
        for query in queries[dataset]
            qid = get_query_id(string_dataset, query.query_path)
            comp_key = (string_dataset, method, qid)
            if !haskey(comparison_results, comp_key)
                inc!(failure_counts[method], string_dataset)
            else
                estimate = comparison_results[comp_key].Estimate
                # NOTE: `x == NaN` is always false under IEEE-754, so the old
                # equality check could never fire — isnan()/isinf() are required.
                if estimate == 0 || isinf(estimate) || isnan(estimate)
                    inc!(failure_counts[method], string_dataset)
                end
            end
        end
        # Key by the string name for consistency with num_queries and with the
        # hand-run estimators tallied below (previously keyed by the enum value).
        failure_probabilities[method][string_dataset] =
            failure_counts[method][string_dataset] / num_queries[string_dataset]
    end
end

failure_counts["BSK"] = counter(String)
failure_counts["BSK++"] = counter(String)
failure_counts["AvgQ64"] = counter(String)
failure_probabilities["BSK"] = Dict()
failure_probabilities["BSK++"] = Dict()
failure_probabilities["AvgQ64"] = Dict()
for dataset in datasets
string_dataset = string(dataset)
bsk_params = ExperimentParams(deg_stats_type=MaxDegStats,
dataset=dataset,
partitioning_scheme=[(Hash, 64)],
max_cycle_size = -1,
inference_max_paths = 10^30,
use_partial_sums = false,
description = "BSK",
n_replications = 1)
run_estimation_experiments([bsk_params]; timeout=TIMEOUT_SEC)
bsk_filename = params_to_results_filename(bsk_params)
bsk_path = "Experiments/Results/Estimation_" * bsk_filename
bsk_df = CSV.read(bsk_path, DataFrame; normalizenames=true)
for i in 1:nrow(bsk_df)
if bsk_df[i, :Failure]
inc!(failure_counts["BSK"], string_dataset)
end
end
failure_probabilities["BSK"][string_dataset] = failure_counts["BSK"][string_dataset] / num_queries[string_dataset]


bsk_agg_params = ExperimentParams(deg_stats_type=MaxDegStats,
dataset=dataset,
partitioning_scheme=[(Hash, 64)],
max_cycle_size = -1,
inference_max_paths = 10^30,
use_partial_sums = true,
description = "BSK++",
n_replications=1)
run_estimation_experiments([bsk_agg_params]; timeout=TIMEOUT_SEC)
bsk_agg_filename = params_to_results_filename(bsk_agg_params)
bsk_agg_path = "Experiments/Results/Estimation_" * bsk_agg_filename
bsk_agg_df = CSV.read(bsk_agg_path, DataFrame; normalizenames=true)
for i in 1:nrow(bsk_agg_df)
if bsk_agg_df[i, :Failure]
inc!(failure_counts["BSK++"], string_dataset)
end
end
failure_probabilities["BSK++"][string_dataset] = failure_counts["BSK++"][string_dataset] / num_queries[string_dataset]



avg_params = ExperimentParams(dataset=dataset, n_replications=1)
run_estimation_experiments([avg_params]; timeout=TIMEOUT_SEC)
avg_filename = params_to_results_filename(avg_params)
avg_path = "Experiments/Results/Estimation_" * avg_filename
avg_df = CSV.read(avg_path, DataFrame; normalizenames=true)
for i in 1:nrow(avg_df)
if avg_df[i, :Failure]
inc!(failure_counts["AvgQ64"], string_dataset)
end
end
failure_probabilities["AvgQ64"][string_dataset] = failure_counts["AvgQ64"][string_dataset] / num_queries[string_dataset]
end
45 changes: 27 additions & 18 deletions Experiments/Scripts/run-cycle-experiments.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,46 +7,55 @@ max_cycles = 6
experiment_params_list = ExperimentParams[ExperimentParams(dataset=current_dataset, max_cycle_size=current_size) for current_dataset in datasets for current_size in 1:max_cycles]

# println("started building")
build_experiments(experiment_params_list)
#build_experiments(experiment_params_list)
# println("started estimating")
run_estimation_experiments(experiment_params_list)
#run_estimation_experiments(experiment_params_list)
println("started graphing")

graph_grouped_box_plot(experiment_params_list;
x_type=dataset,
x_type=cycle_size,
ylims=[10^-20, 10^15],
y_ticks =[10^-20,10^-15, 10^-10, 10^-5, 1, 10^5, 10^10, 10^15],
grouping=cycle_size,
grouping=dataset,
y_type=estimate_error,
y_label="Relative Error 10^ (s)",
filename="cycles_size_error")
dimensions = (600, 400),
legend_pos = nothing,
y_label="Relative Error 10^ ",
x_label="Max Cycle Size",
filename="cycles-size-error")

graph_grouped_box_plot(experiment_params_list;
x_type=dataset,
x_type=cycle_size,
ylims=[10^-3, 10^1],
y_ticks =[10^-3, 10^-2, 10^-1, 1, 10],
grouping=cycle_size,
grouping=dataset,
y_type=runtime,
y_label="Seconds 10^ (s)",
filename="cycles_size_runtime")
dimensions = (600, 400),
legend_pos = nothing,
y_label="Inference Latency 10^ (s)",
x_label="Max Cycle Size",
filename="cycles-size-runtime")

graph_grouped_bar_plot(experiment_params_list,
x_type=dataset,
x_type=cycle_size,
y_type=memory_footprint,
ylims=[0, 15.5],
y_ticks =[3, 6, 9, 12, 15],
dimensions = (600, 400),
y_label="Memory Footprint (MB)",
grouping=cycle_size,
filename="cycles_size_memory")
y_label="Statistics Size (MB)",
x_label="Max Cycle Size",
grouping=dataset,
legend_pos = nothing,
filename="cycles-size-memory")

graph_grouped_bar_plot(experiment_params_list,
x_type=dataset,
x_type=cycle_size,
y_type=build_time,
ylims = [0, 4],
y_ticks = [1, 2, 3, 4],
dimensions = (600, 400),
legend_pos = :topright,
y_label="Build Time (s)",
grouping=cycle_size,
filename="cycles_size_build_time")
x_label="Max Cycle Size",
grouping=dataset,
legend_pos = nothing,
filename="cycles-size-build-time")
36 changes: 18 additions & 18 deletions Experiments/graph_results.jl
Original file line number Diff line number Diff line change
Expand Up @@ -218,24 +218,24 @@ function graph_grouped_boxplot_with_comparison_methods(experiment_params_list::V
# This seems to be necessary for using Plots.jl outside of the ipynb framework.
# See this: https://discourse.julialang.org/t/deactivate-plot-display-to-avoid-need-for-x-server/19359/15
ENV["GKSwstype"]="100"
gbplot = groupedboxplot(x_values,
[log10(y) for y in y_values],
group = estimators,
x_ticks = x_ticks,
xlims = [0.5, length(x_order)+.5],
ylims = (log10(ylims[1]),log10(ylims[2])),
y_ticks = [log10(y) for y in y_ticks],
legend = legend_pos,
size = dimensions,
bottom_margin = 40px,
top_margin = 20px,
left_margin = 10mm,
legend_column = 2,
titlefont = (12, :black),
legendfont = (11, :black),
tickfont = (12, :black),
guidefont = (15, :black),
whisker_range=2)
gbplot = groupedboxplot(x_values,
[log10(y) for y in y_values],
group = estimators,
x_ticks = x_ticks,
xlims = [0.5, length(x_order)+.5],
ylims = (log10(ylims[1]),log10(ylims[2])),
y_ticks = [log10(y) for y in y_ticks],
legend = legend_pos,
size = dimensions,
bottom_margin = 40px,
top_margin = 20px,
left_margin = 10mm,
legend_column = 2,
titlefont = (12, :black),
legendfont = (11, :black),
tickfont = (12, :black),
guidefont = (15, :black),
whisker_range=2)
x_label !== nothing && xlabel!(gbplot, x_label)
y_label !== nothing && ylabel!(gbplot, y_label)
y_type == estimate_error && hline!([0], label="exact", linestyle=:solid, lw=2)
Expand Down
27 changes: 18 additions & 9 deletions Experiments/run_estimators.jl
Original file line number Diff line number Diff line change
@@ -1,30 +1,39 @@
function run_estimation_experiments(experiment_params_list::Vector{ExperimentParams})
function run_estimation_experiments(experiment_params_list::Vector{ExperimentParams}; timeout::Float64=Inf)
for experiment_params in experiment_params_list
dataset = experiment_params.dataset
replications = experiment_params.n_replications
println(replications)
all_queries = load_querysets([dataset]; require_true_cardinality = true)
summary_file_location = "Experiments/SerializedSummaries/" * params_to_summary_filename(experiment_params)
!isfile(summary_file_location) && error("The summary has not been built yet! \n Attempted File Location: $(summary_file_location)")
summary::ColorSummary = deserialize(summary_file_location)
experiment_results = SharedArray{Tuple{Float64, Float64, Float64, String255, String255, Float64}}(length(all_queries[dataset]))
experiment_results = SharedArray{Tuple{Float64, Float64, Float64, String255, String255, Float64, Bool}}(length(all_queries[dataset]))
@sync @distributed for i in shuffle(collect(eachindex(experiment_results)))
query::QueryGraph = all_queries[dataset][i].query
query_path = all_queries[dataset][i].query_path
exact_size = all_queries[dataset][i].exact_size
estimate_results = [(@timed get_cardinality_bounds(query, summary;
estimate_results = [(@timed get_cardinality_bounds(query,
summary;
max_partial_paths = experiment_params.inference_max_paths,
use_partial_sums=experiment_params.use_partial_sums, usingStoredStats=true,
use_partial_sums=experiment_params.use_partial_sums,
usingStoredStats=true,
sampling_strategy=experiment_params.sampling_strategy,
only_shortest_path_cycle= experiment_params.only_shortest_path_cycle)) for _ in 1:3]
only_shortest_path_cycle= experiment_params.only_shortest_path_cycle,
timeout=timeout)) for _ in 1:replications]
estimate_time = median([x.time for x in estimate_results]) # Convert back to seconds from nano seconds
estimate_failure = isnan(estimate_results[1].value) || isinf(estimate_results[1].value) || estimate_results[1].value == 0
estimate = max(1, estimate_results[1].value)
if isinf(estimate) || isnan(estimate)
if isinf(estimate)
estimate = 10^35
end
if isnan(estimate)
estimate = 1.0
end
query_type = all_queries[dataset][i].query_type
experiment_results[i] = (estimate, exact_size, estimate_time, query_type, query_path, nv(query.graph))
experiment_results[i] = (estimate, exact_size, estimate_time, query_type, query_path, nv(query.graph), estimate_failure)
end
final_results = [(x[1], x[2], x[3], String(x[4]), String(x[5]), x[6]) for x in experiment_results]
final_results = [("Estimate", "TrueCard", "EstimationTime", "QueryType", "QueryPath", "QuerySize"); final_results]
final_results = [(x[1], x[2], x[3], String(x[4]), String(x[5]), x[6], x[7]) for x in experiment_results]
final_results = [("Estimate", "TrueCard", "EstimationTime", "QueryType", "QueryPath", "QuerySize", "Failure"); final_results]
results_file_location = "Experiments/Results/Estimation_" * params_to_results_filename(experiment_params)
writedlm(results_file_location, final_results, ",")
end
Expand Down
6 changes: 4 additions & 2 deletions Experiments/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,14 @@ struct ExperimentParams
use_partial_sums::Bool
sampling_strategy::SAMPLING_STRATEGY
description::String
n_replications::Int

function ExperimentParams(;dataset::DATASET, max_cycle_size=6,
only_shortest_path_cycle=false, summary_max_paths=1000,
partitioning_scheme::Vector{Tuple{PARTITIONER, Int}} = [(QuasiStable, 64)], weighting=true,
inference_max_paths=500, use_partial_sums=true,
sampling_strategy=redistributive, proportion_updated=0.0, proportion_deleted=0.0,
deg_stats_type::Type=AvgDegStats, description="")
deg_stats_type::Type=AvgDegStats, description="", n_replications=3)
return new(dataset,
ColorSummaryParams(deg_stats_type=deg_stats_type,
max_cycle_size=max_cycle_size,
Expand All @@ -32,7 +33,8 @@ struct ExperimentParams
only_shortest_path_cycle,
use_partial_sums,
sampling_strategy,
description
description,
n_replications
)
end
end
Expand Down
Loading

0 comments on commit 03ec42e

Please sign in to comment.