Skip to content

Commit

Permalink
Define estimator failure & write script
Browse files Browse the repository at this point in the history
  • Loading branch information
kylebd99 committed Dec 22, 2023
1 parent 999c880 commit 03ec42e
Show file tree
Hide file tree
Showing 9 changed files with 216 additions and 80 deletions.
41 changes: 17 additions & 24 deletions Experiments/Experiments.jl
Original file line number Diff line number Diff line change
@@ -1,35 +1,28 @@
# Overall Experiments Harness Include File
using Serialization: serialize, deserialize
using BenchmarkTools
using Plots
using Plots.PlotMeasures
using StatsPlots
using CSV, DataFrames
using Parquet2: Dataset
using DelimitedFiles: writedlm
using BenchmarkTools
using Random
using Printf
using SharedArrays
using WeakRefStrings
using Random
using StatsPlots
using Distributed
@everywhere using CSV
@everywhere using DataFrames
@everywhere using DelimitedFiles: writedlm
@everywhere using Parquet2: Dataset
@everywhere using Random
@everywhere using Serialization: serialize, deserialize
@everywhere using SharedArrays
@everywhere using WeakRefStrings


include("../Source/CardinalityWithColors.jl")
include("utils.jl")
include("load_datasets.jl")
include("load_querysets.jl")
include("build_color_summaries.jl")
include("get_true_cardinalities.jl")
include("run_estimators.jl")
include("graph_results.jl")
@everywhere include("../Source/CardinalityWithColors.jl")
@everywhere include("utils.jl")
@everywhere include("load_datasets.jl")
include("load_querysets.jl")
@everywhere include("build_color_summaries.jl")
include("get_true_cardinalities.jl")
@everywhere include("run_estimators.jl")
@everywhere using SharedArrays
@everywhere using WeakRefStrings
@everywhere using DelimitedFiles: writedlm
@everywhere using Parquet2: Dataset
@everywhere using Random
@everywhere using CSV, DataFrames
@everywhere using Serialization: serialize, deserialize
include("graph_results.jl")

const TIMEOUT_SEC::Float64 = 60.0
11 changes: 8 additions & 3 deletions Experiments/Scripts/comparison_exps.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ include("../Experiments.jl")

#datasets = [human, aids, lubm80, yeast, hprd, dblp, youtube, eu2005, patents, wordnet]
datasets = [human, aids, lubm80, yeast, dblp, youtube, eu2005, patents]
#datasets = [human, aids, lubm80]

experiment_params = Vector{ExperimentParams}()
for dataset in datasets
Expand All @@ -15,7 +16,7 @@ for dataset in datasets
dataset=dataset,
partitioning_scheme=[(QuasiStable, 32), (NeighborNodeLabels, 32),(QuasiStable, 32), (NeighborNodeLabels, 32)],
description = "AvgQ64N64"))

#=
push!(experiment_params, ExperimentParams(deg_stats_type=MinDegStats,
dataset=dataset,
partitioning_scheme=[(QuasiStable, 64)],
Expand All @@ -31,25 +32,28 @@ for dataset in datasets
dataset=dataset,
partitioning_scheme=[(Hash, 64)],
max_cycle_size = -1,
inference_max_paths = 10^30,
use_partial_sums = false,
description = "BSK"))
push!(experiment_params, ExperimentParams(deg_stats_type=AvgDegStats,
dataset=dataset,
partitioning_scheme=[(QuasiStable, 1)],
max_cycle_size = -1,
description = "IndEst"))
description = "IndEst")) =#
end

#build_experiments(experiment_params)

#run_estimation_experiments(experiment_params)
run_estimation_experiments(experiment_params; timeout=1.0)

order = [string(data) for data in datasets]

graph_grouped_boxplot_with_comparison_methods(experiment_params;
ylims=[10^-5, 10^4],
y_ticks=[10^-5, 10^-4, 10^-3, 10^-2, 10^-1, 10^0, 10^1, 10^2, 10^3, 10^4],
y_type = runtime,
x_type = dataset,
x_order = order,
grouping=description,
dimensions = (1450, 550),
Expand All @@ -61,6 +65,7 @@ graph_grouped_boxplot_with_comparison_methods(experiment_params;
ylims=[10^-21, 10^21],
y_ticks=[10^-20, 10^-15, 10^-10, 10^-5, 10^-2, 10^0, 10^2, 10^5, 10^10, 10^15, 10^20],
y_type = estimate_error,
x_type = dataset,
x_order = order,
grouping=description,
dimensions = (1450, 550),
Expand Down
94 changes: 94 additions & 0 deletions Experiments/Scripts/estimator-failure.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
include("../Experiments.jl")

#datasets = [human, aids]
datasets = [human, aids, lubm80, yeast, dblp, youtube, eu2005, patents]
queries = load_querysets(datasets)
num_queries = Dict(string(dataset)=>length(queries[dataset]) for dataset in datasets)

methods, comparison_results = comparison_dataset()

# Tally, per comparison method and dataset, how many queries the method "failed" on.
# A failure is: no recorded result for the query, or an estimate of 0, ±Inf, or NaN.
failure_counts = Dict()
failure_probabilities = Dict()
for method in methods
    failure_counts[method] = counter(String)
    failure_probabilities[method] = Dict()
    for dataset in datasets
        string_dataset = string(dataset)
        for query in queries[dataset]
            qid = get_query_id(string_dataset, query.query_path)
            comp_key = (string_dataset, method, qid)
            if !haskey(comparison_results, comp_key)
                inc!(failure_counts[method], string_dataset)
            else
                estimate = comparison_results[comp_key].Estimate
                # NOTE: `x == NaN` is always false under IEEE-754, so the old
                # equality check could never fire — isnan()/isinf() are required.
                if estimate == 0 || isinf(estimate) || isnan(estimate)
                    inc!(failure_counts[method], string_dataset)
                end
            end
        end
        # Key by the string name for consistency with num_queries and with the
        # hand-run estimators tallied below (previously keyed by the enum value).
        failure_probabilities[method][string_dataset] =
            failure_counts[method][string_dataset] / num_queries[string_dataset]
    end
end

failure_counts["BSK"] = counter(String)
failure_counts["BSK++"] = counter(String)
failure_counts["AvgQ64"] = counter(String)
failure_probabilities["BSK"] = Dict()
failure_probabilities["BSK++"] = Dict()
failure_probabilities["AvgQ64"] = Dict()
for dataset in datasets
string_dataset = string(dataset)
bsk_params = ExperimentParams(deg_stats_type=MaxDegStats,
dataset=dataset,
partitioning_scheme=[(Hash, 64)],
max_cycle_size = -1,
inference_max_paths = 10^30,
use_partial_sums = false,
description = "BSK",
n_replications = 1)
run_estimation_experiments([bsk_params]; timeout=TIMEOUT_SEC)
bsk_filename = params_to_results_filename(bsk_params)
bsk_path = "Experiments/Results/Estimation_" * bsk_filename
bsk_df = CSV.read(bsk_path, DataFrame; normalizenames=true)
for i in 1:nrow(bsk_df)
if bsk_df[i, :Failure]
inc!(failure_counts["BSK"], string_dataset)
end
end
failure_probabilities["BSK"][string_dataset] = failure_counts["BSK"][string_dataset] / num_queries[string_dataset]


bsk_agg_params = ExperimentParams(deg_stats_type=MaxDegStats,
dataset=dataset,
partitioning_scheme=[(Hash, 64)],
max_cycle_size = -1,
inference_max_paths = 10^30,
use_partial_sums = true,
description = "BSK++",
n_replications=1)
run_estimation_experiments([bsk_agg_params]; timeout=TIMEOUT_SEC)
bsk_agg_filename = params_to_results_filename(bsk_agg_params)
bsk_agg_path = "Experiments/Results/Estimation_" * bsk_agg_filename
bsk_agg_df = CSV.read(bsk_agg_path, DataFrame; normalizenames=true)
for i in 1:nrow(bsk_agg_df)
if bsk_agg_df[i, :Failure]
inc!(failure_counts["BSK++"], string_dataset)
end
end
failure_probabilities["BSK++"][string_dataset] = failure_counts["BSK++"][string_dataset] / num_queries[string_dataset]



avg_params = ExperimentParams(dataset=dataset, n_replications=1)
run_estimation_experiments([avg_params]; timeout=TIMEOUT_SEC)
avg_filename = params_to_results_filename(avg_params)
avg_path = "Experiments/Results/Estimation_" * avg_filename
avg_df = CSV.read(avg_path, DataFrame; normalizenames=true)
for i in 1:nrow(avg_df)
if avg_df[i, :Failure]
inc!(failure_counts["AvgQ64"], string_dataset)
end
end
failure_probabilities["AvgQ64"][string_dataset] = failure_counts["AvgQ64"][string_dataset] / num_queries[string_dataset]
end
45 changes: 27 additions & 18 deletions Experiments/Scripts/run-cycle-experiments.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,46 +7,55 @@ max_cycles = 6
experiment_params_list = ExperimentParams[ExperimentParams(dataset=current_dataset, max_cycle_size=current_size) for current_dataset in datasets for current_size in 1:max_cycles]

# println("started building")
build_experiments(experiment_params_list)
#build_experiments(experiment_params_list)
# println("started estimating")
run_estimation_experiments(experiment_params_list)
#run_estimation_experiments(experiment_params_list)
println("started graphing")

graph_grouped_box_plot(experiment_params_list;
x_type=dataset,
x_type=cycle_size,
ylims=[10^-20, 10^15],
y_ticks =[10^-20,10^-15, 10^-10, 10^-5, 1, 10^5, 10^10, 10^15],
grouping=cycle_size,
grouping=dataset,
y_type=estimate_error,
y_label="Relative Error 10^ (s)",
filename="cycles_size_error")
dimensions = (600, 400),
legend_pos = nothing,
y_label="Relative Error 10^ ",
x_label="Max Cycle Size",
filename="cycles-size-error")

graph_grouped_box_plot(experiment_params_list;
x_type=dataset,
x_type=cycle_size,
ylims=[10^-3, 10^1],
y_ticks =[10^-3, 10^-2, 10^-1, 1, 10],
grouping=cycle_size,
grouping=dataset,
y_type=runtime,
y_label="Seconds 10^ (s)",
filename="cycles_size_runtime")
dimensions = (600, 400),
legend_pos = nothing,
y_label="Inference Latency 10^ (s)",
x_label="Max Cycle Size",
filename="cycles-size-runtime")

graph_grouped_bar_plot(experiment_params_list,
x_type=dataset,
x_type=cycle_size,
y_type=memory_footprint,
ylims=[0, 15.5],
y_ticks =[3, 6, 9, 12, 15],
dimensions = (600, 400),
y_label="Memory Footprint (MB)",
grouping=cycle_size,
filename="cycles_size_memory")
y_label="Statistics Size (MB)",
x_label="Max Cycle Size",
grouping=dataset,
legend_pos = nothing,
filename="cycles-size-memory")

graph_grouped_bar_plot(experiment_params_list,
x_type=dataset,
x_type=cycle_size,
y_type=build_time,
ylims = [0, 4],
y_ticks = [1, 2, 3, 4],
dimensions = (600, 400),
legend_pos = :topright,
y_label="Build Time (s)",
grouping=cycle_size,
filename="cycles_size_build_time")
x_label="Max Cycle Size",
grouping=dataset,
legend_pos = nothing,
filename="cycles-size-build-time")
36 changes: 18 additions & 18 deletions Experiments/graph_results.jl
Original file line number Diff line number Diff line change
Expand Up @@ -218,24 +218,24 @@ function graph_grouped_boxplot_with_comparison_methods(experiment_params_list::V
# This seems to be necessary for using Plots.jl outside of the ipynb framework.
# See this: https://discourse.julialang.org/t/deactivate-plot-display-to-avoid-need-for-x-server/19359/15
ENV["GKSwstype"]="100"
gbplot = groupedboxplot(x_values,
[log10(y) for y in y_values],
group = estimators,
x_ticks = x_ticks,
xlims = [0.5, length(x_order)+.5],
ylims = (log10(ylims[1]),log10(ylims[2])),
y_ticks = [log10(y) for y in y_ticks],
legend = legend_pos,
size = dimensions,
bottom_margin = 40px,
top_margin = 20px,
left_margin = 10mm,
legend_column = 2,
titlefont = (12, :black),
legendfont = (11, :black),
tickfont = (12, :black),
guidefont = (15, :black),
whisker_range=2)
gbplot = groupedboxplot(x_values,
[log10(y) for y in y_values],
group = estimators,
x_ticks = x_ticks,
xlims = [0.5, length(x_order)+.5],
ylims = (log10(ylims[1]),log10(ylims[2])),
y_ticks = [log10(y) for y in y_ticks],
legend = legend_pos,
size = dimensions,
bottom_margin = 40px,
top_margin = 20px,
left_margin = 10mm,
legend_column = 2,
titlefont = (12, :black),
legendfont = (11, :black),
tickfont = (12, :black),
guidefont = (15, :black),
whisker_range=2)
x_label !== nothing && xlabel!(gbplot, x_label)
y_label !== nothing && ylabel!(gbplot, y_label)
y_type == estimate_error && hline!([0], label="exact", linestyle=:solid, lw=2)
Expand Down
27 changes: 18 additions & 9 deletions Experiments/run_estimators.jl
Original file line number Diff line number Diff line change
@@ -1,30 +1,39 @@
function run_estimation_experiments(experiment_params_list::Vector{ExperimentParams})
function run_estimation_experiments(experiment_params_list::Vector{ExperimentParams}; timeout::Float64=Inf)
for experiment_params in experiment_params_list
dataset = experiment_params.dataset
replications = experiment_params.n_replications
println(replications)
all_queries = load_querysets([dataset]; require_true_cardinality = true)
summary_file_location = "Experiments/SerializedSummaries/" * params_to_summary_filename(experiment_params)
!isfile(summary_file_location) && error("The summary has not been built yet! \n Attempted File Location: $(summary_file_location)")
summary::ColorSummary = deserialize(summary_file_location)
experiment_results = SharedArray{Tuple{Float64, Float64, Float64, String255, String255, Float64}}(length(all_queries[dataset]))
experiment_results = SharedArray{Tuple{Float64, Float64, Float64, String255, String255, Float64, Bool}}(length(all_queries[dataset]))
@sync @distributed for i in shuffle(collect(eachindex(experiment_results)))
query::QueryGraph = all_queries[dataset][i].query
query_path = all_queries[dataset][i].query_path
exact_size = all_queries[dataset][i].exact_size
estimate_results = [(@timed get_cardinality_bounds(query, summary;
estimate_results = [(@timed get_cardinality_bounds(query,
summary;
max_partial_paths = experiment_params.inference_max_paths,
use_partial_sums=experiment_params.use_partial_sums, usingStoredStats=true,
use_partial_sums=experiment_params.use_partial_sums,
usingStoredStats=true,
sampling_strategy=experiment_params.sampling_strategy,
only_shortest_path_cycle= experiment_params.only_shortest_path_cycle)) for _ in 1:3]
only_shortest_path_cycle= experiment_params.only_shortest_path_cycle,
timeout=timeout)) for _ in 1:replications]
estimate_time = median([x.time for x in estimate_results]) # Convert back to seconds from nano seconds
estimate_failure = isnan(estimate_results[1].value) || isinf(estimate_results[1].value) || estimate_results[1].value == 0
estimate = max(1, estimate_results[1].value)
if isinf(estimate) || isnan(estimate)
if isinf(estimate)
estimate = 10^35
end
if isnan(estimate)
estimate = 1.0
end
query_type = all_queries[dataset][i].query_type
experiment_results[i] = (estimate, exact_size, estimate_time, query_type, query_path, nv(query.graph))
experiment_results[i] = (estimate, exact_size, estimate_time, query_type, query_path, nv(query.graph), estimate_failure)
end
final_results = [(x[1], x[2], x[3], String(x[4]), String(x[5]), x[6]) for x in experiment_results]
final_results = [("Estimate", "TrueCard", "EstimationTime", "QueryType", "QueryPath", "QuerySize"); final_results]
final_results = [(x[1], x[2], x[3], String(x[4]), String(x[5]), x[6], x[7]) for x in experiment_results]
final_results = [("Estimate", "TrueCard", "EstimationTime", "QueryType", "QueryPath", "QuerySize", "Failure"); final_results]
results_file_location = "Experiments/Results/Estimation_" * params_to_results_filename(experiment_params)
writedlm(results_file_location, final_results, ",")
end
Expand Down
6 changes: 4 additions & 2 deletions Experiments/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,14 @@ struct ExperimentParams
use_partial_sums::Bool
sampling_strategy::SAMPLING_STRATEGY
description::String
n_replications::Int

function ExperimentParams(;dataset::DATASET, max_cycle_size=6,
only_shortest_path_cycle=false, summary_max_paths=1000,
partitioning_scheme::Vector{Tuple{PARTITIONER, Int}} = [(QuasiStable, 64)], weighting=true,
inference_max_paths=500, use_partial_sums=true,
sampling_strategy=redistributive, proportion_updated=0.0, proportion_deleted=0.0,
deg_stats_type::Type=AvgDegStats, description="")
deg_stats_type::Type=AvgDegStats, description="", n_replications=3)
return new(dataset,
ColorSummaryParams(deg_stats_type=deg_stats_type,
max_cycle_size=max_cycle_size,
Expand All @@ -32,7 +33,8 @@ struct ExperimentParams
only_shortest_path_cycle,
use_partial_sums,
sampling_strategy,
description
description,
n_replications
)
end
end
Expand Down
Loading

0 comments on commit 03ec42e

Please sign in to comment.