-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.cc
265 lines (233 loc) · 10.6 KB
/
main.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
#include "src/common/bigvector.h"
#include "src/common/io.h"
#include "src/common/types.h"
#include "src/common/utils.h"
#include "src/pq/benchmark.h"
#include "src/pq/constants.h"
#include "src/pq/utils.h"
#include "src/scalar/scalar.h"

#include <boost/program_options.hpp>
#include <boost/program_options/value_semantic.hpp>

#include <algorithm>
#include <cstddef>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <optional>
#include <random>
#include <stdexcept>
#include <string>
#include <vector>
namespace {
// Writes a synthetic dataset of `numVecs` uniformly random vectors, each of
// the given dimension, to data/<dataset>.fvec next to this source file.
//
// @param dataset   Base name of the output file (".fvec" is appended).
// @param dimension Number of components per generated vector.
// @param numVecs   Number of vectors to write.
void generateBenchmark(const std::string& dataset,
                       int dimension,
                       std::size_t numVecs) {
    auto dir = std::filesystem::path(__FILE__).parent_path();
    auto outFile = dir / "data" / (dataset + ".fvec");
    std::cout << "Writing " << numVecs << " vectors with dimension " << dimension
              << " to " << outFile << std::endl;
    std::minstd_rand engine;
    std::uniform_real_distribution<float> unit{0.0F, 1.0F};
    std::vector<float> buffer(dimension);
    // Generator callback: refills the shared buffer with fresh uniform
    // samples and hands back a copy for each requested vector.
    auto nextVector = [&] {
        for (auto& component : buffer) {
            component = unit(engine);
        }
        return buffer;
    };
    writeFvecs(outFile, dimension, numVecs, nextVector);
    std::cout << "Wrote " << numVecs << " vectors with dimension " << dimension
              << " to " << outFile << std::endl;
}
// Loads data/queries-<dataset>.fvec and data/corpus-<dataset>.fvec, zero-pads
// the queries to a whole number of code books and runs the PQ benchmark.
//
// @throws std::runtime_error if the query and corpus dimensions differ.
void loadAndRunPqBenchmark(const std::string& dataset,
                           Metric metric,
                           float distanceThreshold,
                           std::size_t docsPerCoarseCluster,
                           std::size_t dimensionsPerCode) {
    auto root = std::filesystem::path(__FILE__).parent_path();
    auto queriesFile = root / "data" / ("queries-" + dataset + ".fvec");
    std::cout << "Loading queries from " << queriesFile << std::endl;
    auto [queries, qdim] = readFvecs(queriesFile);
    std::cout << "Loaded " << queries.size() / qdim << " queries of dimension " << qdim << std::endl;
    // Round up: vectors are padded with zeros to a multiple of the book count.
    std::size_t numBooks{(qdim + dimensionsPerCode - 1) / dimensionsPerCode};
    zeroPad(qdim, numBooks, queries);
    auto corpusFile = root / "data" / ("corpus-" + dataset + ".fvec");
    std::cout << "Loading corpus from " << corpusFile << std::endl;
    BigVector docs{loadAndPrepareData(corpusFile, numBooks, metric == Cosine)};
    std::cout << "Loaded " << docs.numVectors() << " vectors of dimension " << docs.dim() << std::endl;
    if (qdim != docs.dim()) {
        throw std::runtime_error("Dimension mismatch");
    }
    if (docs.numVectors() == 0 || queries.empty()) {
        return;
    }
    runPqBenchmark(dataset, metric, distanceThreshold, docsPerCoarseCluster,
                   numBooks, 10, docs, queries, writePqStats);
}
void loadAndRunPqMergeBenchmark(const std::string& dataset,
Metric metric,
float distanceThreshold,
std::size_t docsPerCoarseCluster,
std::size_t dimensionsPerCode) {
auto root = std::filesystem::path(__FILE__).parent_path();
std::cout << "Loading queries from "
<< (root / "data" / ("queries-" + dataset + ".fvec")) << std::endl;
auto [queries, qdim] = readFvecs(root / "data" / ("queries-" + dataset + ".fvec"));
std::cout << "Loaded " << queries.size() / qdim << " queries of dimension " << qdim << std::endl;
// We pad to a multiple of the number of books so round up.
std::size_t numBooks{(qdim + dimensionsPerCode - 1) / dimensionsPerCode};
zeroPad(qdim, numBooks, queries);
std::cout << "Loading corpus from "
<< (root / "data" / ("corpus-" + dataset + ".fvec")) << std::endl;
BigVector docs1{loadAndPrepareData(
root / "data" / ("corpus-" + dataset + ".fvec"), numBooks, metric == Cosine, {0.0, 0.5})};
BigVector docs2{loadAndPrepareData(
root / "data" / ("corpus-" + dataset + ".fvec"), numBooks, metric == Cosine, {0.5, 1.0})};
std::cout << "Loaded " << (docs1.numVectors() + docs2.numVectors())
<< " vectors of dimension " << docs1.dim() << std::endl;
if (qdim != docs1.dim()) {
throw std::runtime_error("Dimension mismatch");
}
if (docs1.numVectors() == 0 || docs2.numVectors() == 0 || queries.empty()) {
return;
}
runPqMergeBenchmark(dataset, metric, distanceThreshold, docsPerCoarseCluster,
numBooks, 10, docs1, docs2, queries, writePqStats);
}
// Loads corpus and queries for `dataset` and runs the scalar quantisation
// benchmark at the requested bit width.
//
// @throws std::runtime_error if the corpus and query dimensions differ.
void loadAndRunScalarBenchmark(const std::string& dataset, Metric metric, ScalarBits bits) {
    const auto dataDir = std::filesystem::path(__FILE__).parent_path() / "data";
    auto [docs, ddim] = readFvecs(dataDir / ("corpus-" + dataset + ".fvec"));
    auto [queries, qdim] = readFvecs(dataDir / ("queries-" + dataset + ".fvec"));
    if (ddim != qdim) {
        throw std::runtime_error("Dimension mismatch");
    }
    // Nothing to benchmark if either side of the comparison is empty.
    const bool haveData = !docs.empty() && !queries.empty();
    if (haveData) {
        runScalarBenchmark(dataset, metric, bits, 10, qdim, docs, queries);
    }
}
} // unnamed::
// Entry point for the benchmark driver.
//
// Modes:
//   --generate      write a random .fvec dataset (uses --dim and --num-vecs);
//   --run DATASET   run a benchmark against data/corpus-DATASET.fvec and
//                   data/queries-DATASET.fvec: scalar quantisation when
//                   --scalar is supplied, the PQ merge benchmark when --merge
//                   is set, otherwise the standard PQ benchmark.
//
// Returns 0 on success and 1 on a command line error or benchmark failure.
int main(int argc, char* argv[]) {
    bool generate{false};
    int dimension{1024};
    std::size_t numVecs{16UL * 1024UL * 1024UL};
    std::optional<ScalarBits> scalar;
    Metric metric{Cosine};
    bool merge{false};
    float distanceThreshold{0.0F};
    std::size_t docsPerCoarseCluster{COARSE_CLUSTERING_DOCS_PER_CLUSTER};
    // Keep in sync with the option's default_value below. (Previously this
    // was 8, which was dead code: the option default always overrode it.)
    std::size_t dimensionsPerCode{16};
    std::string dataset;

    boost::program_options::options_description desc("Usage: run_benchmark\nOptions");
    desc.add_options()
        ("help,h", "Show this help")
        ("generate,g", boost::program_options::bool_switch(),
         "Generate random data with the specified vector count and dimension")
        ("dim,d", boost::program_options::value<int>()->default_value(1024),
         "The dimension of the data to generate")
        ("num-vecs,v", boost::program_options::value<std::size_t>()->default_value(16UL * 1024UL * 1024UL),
         "The number of document vectors to generate")
        ("scalar,s", boost::program_options::value<std::string>(),
         "Use 1, 4, 4P or 8 bit scalar quantisation. If not supplied then run PQ")
        ("run,r", boost::program_options::value<std::string>(),
         "Run a test dataset")
        ("metric,m", boost::program_options::value<std::string>()->default_value("cosine"),
         "The metric, must be cosine, dot or euclidean with which to compare vectors")
        ("merge", boost::program_options::bool_switch(),
         "Run the merge benchmark instead of the standard benchmark")
        ("perp-distance-threshold", boost::program_options::value<float>()->default_value(0.0F),
         "The ScaNN threshold used for computing the parallel distance cost multiplier")
        ("docs-per-coarse-cluster", boost::program_options::value<std::size_t>()->default_value(COARSE_CLUSTERING_DOCS_PER_CLUSTER),
         "The number of documents per coarse cluster in the PQ index")
        ("dimensions-per-code", boost::program_options::value<std::size_t>()->default_value(16),
         "The number of dimensions per code in the PQ index");

    try {
        boost::program_options::variables_map vm;
        boost::program_options::store(
            boost::program_options::parse_command_line(argc, argv, desc), vm);
        boost::program_options::notify(vm);
        if (vm.count("help")) {
            std::cerr << desc << std::endl;
            return 0;
        }
        if (vm.count("generate")) {
            generate = vm["generate"].as<bool>();
        }
        if (vm.count("dim")) {
            dimension = vm["dim"].as<int>();
            if (dimension <= 0) {
                throw boost::program_options::error("Invalid dimension");
            }
        }
        if (vm.count("num-vecs")) {
            numVecs = vm["num-vecs"].as<std::size_t>();
            if (numVecs == 0) {
                throw boost::program_options::error("Invalid number of vectors");
            }
        }
        if (vm.count("scalar")) {
            auto s = vm["scalar"].as<std::string>();
            if (s == "1") {
                scalar = B1;
            } else if (s == "4") {
                scalar = B4;
            } else if (s == "4P") {
                scalar = B4P;
            } else if (s == "8") {
                scalar = B8;
            } else if (s != "None") {
                // "None" is accepted as an explicit request for the PQ path.
                throw boost::program_options::error("Invalid scalar quantisation");
            }
        }
        if (vm.count("run")) {
            dataset = vm["run"].as<std::string>();
        }
        if (vm.count("metric")) {
            auto m = vm["metric"].as<std::string>();
            if (m == "cosine") {
                metric = Cosine;
            } else if (m == "dot") {
                metric = Dot;
            } else if (m == "euclidean") {
                metric = Euclidean;
            } else {
                throw boost::program_options::error("Invalid metric");
            }
        }
        if (vm.count("merge")) {
            merge = vm["merge"].as<bool>();
        }
        // BUG FIX: this previously read vm["distance"], an option that was
        // never declared, so a user-supplied --perp-distance-threshold was
        // silently ignored and the threshold always stayed at 0.0F.
        if (vm.count("perp-distance-threshold")) {
            distanceThreshold = vm["perp-distance-threshold"].as<float>();
        }
        if (vm.count("docs-per-coarse-cluster")) {
            docsPerCoarseCluster = vm["docs-per-coarse-cluster"].as<std::size_t>();
            if (docsPerCoarseCluster == 0) {
                throw boost::program_options::error("Invalid docs per coarse cluster");
            }
        }
        if (vm.count("dimensions-per-code")) {
            dimensionsPerCode = vm["dimensions-per-code"].as<std::size_t>();
            if (dimensionsPerCode == 0) {
                throw boost::program_options::error("Invalid dimensions per code");
            }
        }
    } catch (const boost::program_options::error& e) {
        std::cerr << "Error parsing command line: " << e.what() << std::endl;
        std::cerr << desc << std::endl;
        return 1;
    }

    if (generate) {
        generateBenchmark(dataset, dimension, numVecs);
    } else if (!dataset.empty()) {
        try {
            if (scalar != std::nullopt) {
                loadAndRunScalarBenchmark(dataset, metric, *scalar);
            } else if (merge) {
                loadAndRunPqMergeBenchmark(dataset, metric, distanceThreshold,
                                           docsPerCoarseCluster, dimensionsPerCode);
            } else {
                loadAndRunPqBenchmark(dataset, metric, distanceThreshold,
                                      docsPerCoarseCluster, dimensionsPerCode);
            }
        } catch (const std::exception& e) {
            std::cerr << "Caught exception: " << e.what() << std::endl;
            return 1;
        }
    }
    return 0;
}