-
Notifications
You must be signed in to change notification settings - Fork 347
/
statistical_tests_utils.h
170 lines (150 loc) · 7.33 KB
/
statistical_tests_utils.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
//
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef DIFFERENTIAL_PRIVACY_CPP_TESTING_STATISTICAL_TESTS_UTILS_H_
#define DIFFERENTIAL_PRIVACY_CPP_TESTING_STATISTICAL_TESTS_UTILS_H_
#include <fstream>
#include <functional>
#include <iostream>
#include "google/protobuf/text_format.h"
#include "absl/container/flat_hash_map.h"
#include "absl/log/check.h"
#include "absl/status/statusor.h"
#include "absl/strings/str_cat.h"
#include "algorithms/rand.h"
namespace differential_privacy::testing {
// Sample a value from Laplace distribution, implemented with absl random
// generators (i.e. non-secure).
double SampleReferenceLaplacian(double mean, double variance,
SecureURBG* random);
// Generates number_of_samples samples from both sample_generator_a and
// sample_generator_b generators and decides whether these 2 sets of random
// samples were likely drawn from similar discrete distributions. See
// (broken link) for more information.
bool GenerateClosenessVote(std::function<double()> sample_generator_a,
std::function<double()> sample_generator_b,
int number_of_samples, double l2_tolerance,
double granularity);
template <typename T>
double ComputeApproximateDpTestValue(
absl::flat_hash_map<T, int64_t> histogram_a,
absl::flat_hash_map<T, int64_t> histogram_b, double epsilon,
int num_of_samples) {
double test_value = 0;
for (auto& [result, sample_count] : histogram_a) {
auto it_b = histogram_b.find(result);
if (it_b != histogram_b.end()) {
double sample_count_b = it_b->second;
test_value +=
std::max(0.0, (sample_count - std::exp(epsilon) * sample_count_b) /
num_of_samples);
} else {
test_value += sample_count / num_of_samples;
}
}
return test_value;
}
template <typename T>
absl::flat_hash_map<T, int64_t> BuildHistogram(const std::vector<T>& samples) {
absl::flat_hash_map<T, int64_t> histogram;
for (const T& sample : samples) ++histogram[sample];
return histogram;
}
// Decides whether two sets of random samples were likely drawn from a pair of
// discrete distributions that approximately satisfy (ε,δ) differential privacy.
//
// The two distributions are considered to be (ε,δ) differentially private if
// the likelihood of any event with respect to the first distribution is at most
// δ plus e^ε times the likelihood of the same event in the second distribution
// and vice versa. Moreover, the distributions are considered approximately
// (ε,δ) differentially private if there exists a δ' such that the distributions
// are (ε,δ') differentially private and |δ' - δ| is less than half of a given
// tolerance α. Otherwise if no δ' exists such that |δ' - δ| is less than α, the
// distributions are not considered approximately (ε, δ) differentially private.
// Assuming that α > (m / n)^0.5 * (1 + e^(2 * ε)), the error probability is at
// most (1 + e^(2 * ε)) / (n * (α - (m / n)^0.5 * (1 + e^(2 * ε)))^2), where m
// is the size of the support of the distributions and n is the expected value
// of a Poisson distribution from which the number of samples is drawn. See
// (broken link) for more information.
template <typename T>
bool VerifyApproximateDp(const std::vector<T>& samples_a,
const std::vector<T>& samples_b, double epsilon,
double delta, double delta_tolerance) {
DCHECK(samples_a.size() == samples_b.size())
<< "The sample sets must be of equal size.";
DCHECK(!samples_a.empty()) << "The sample sets must not be empty";
DCHECK(delta_tolerance > 0) << "The delta tolerance must be positive";
DCHECK(delta_tolerance < 1) << "The delta tolerance should be less than 1";
DCHECK(epsilon >= 0) << "Epsilon must not be negative";
DCHECK(delta >= 0) << "Delta must not be negative";
DCHECK(delta < 1) << "Delta should be less than 1";
absl::flat_hash_map<T, int64_t> histogram_a = BuildHistogram(samples_a);
absl::flat_hash_map<T, int64_t> histogram_b = BuildHistogram(samples_b);
double test_value_a = ComputeApproximateDpTestValue(
histogram_a, histogram_b, epsilon, samples_a.size());
double test_value_b = ComputeApproximateDpTestValue(
histogram_b, histogram_a, epsilon, samples_b.size());
return test_value_a < delta + delta_tolerance &&
test_value_b < delta + delta_tolerance;
}
// Generates number_of_samples samples from both sample_generator_a and
// sample_generator_b generators and decides whether two sets of random samples
// were likely drawn from a pair of discrete distributions that approximately
// satisfy (ε,δ) differential privacy. See (broken link) for more
// information. The test will fail if this pair of samples do not satisfy
// (ε,δ + delta_tolerance) differential privacy.
bool GenerateApproximateDpVote(std::function<double()> sample_generator_a,
std::function<double()> sample_generator_b,
int number_of_samples, double epsilon,
double delta, double delta_tolerance,
double granularity);
// Generates number_of_votes of votes from vote_generator to determine a
// majority. Stops early as soon as the majority is clear. Returns the majority.
bool RunBallot(std::function<bool()> vote_generator, int number_of_votes);
// Same as above but in addition stops early if any vote_generator returns an
// error and returns that error. If there are no errors work as above overload.
absl::StatusOr<bool> RunBallot(
std::function<absl::StatusOr<bool>()> vote_generator, int number_of_votes);
template <typename T>
std::optional<T> ReadProto(std::istream* proto_file) {
T tests;
std::string serialized_protobuf;
std::string line;
while (getline(*proto_file, line)) {
absl::StrAppend(&serialized_protobuf, line, "\n");
}
*proto_file >> serialized_protobuf;
if (!google::protobuf::TextFormat::ParseFromString(serialized_protobuf, &tests)) {
return std::optional<T>();
}
return tests;
}
template <typename T>
std::optional<T> ReadProto(const std::string& path) {
std::ifstream proto_file(path);
if (!proto_file.is_open()) {
return std::optional<T>();
}
return ReadProto<T>(&proto_file);
}
// Partitions the interval between lower and upper into num_buckets subintervals
// of equal size and return the index (from 0 to num_buckets - 1) of the
// subinterval that contains the specified sample.
// Samples outside the bounds will be assigned to the lowest or highest bin as
// appropriate. Samples that are exactly equal to bin boundaries will be
// assigned to the higher bin.
int Bucketize(double sample, double lower, double upper, int num_buckets);
} // namespace differential_privacy::testing
#endif // DIFFERENTIAL_PRIVACY_CPP_TESTING_STATISTICAL_TESTS_UTILS_H_